diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000000000..2b00788b1
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,90 @@
+BasedOnStyle: LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignConsecutiveAssignments: true
+AlignEscapedNewlines: Right
+AlignOperands: true
+AllowAllArgumentsOnNextLine: false
+AllowAllConstructorInitializersOnNextLine: false
+AllowAllParametersOfDeclarationOnNextLine: false
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLambdasOnASingleLine: All
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: None
+AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: false
+BinPackParameters: false
+BreakBeforeBraces: Custom
+BraceWrapping:
+  AfterCaseLabel: false
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterStruct: true
+  AfterUnion: true
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+BreakBeforeBinaryOperators: None
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializers: BeforeComma
+BreakInheritanceList: BeforeComma
+ColumnLimit: 100
+CompactNamespaces: false
+ContinuationIndentWidth: 2
+IncludeBlocks:   Regroup
+IncludeCategories:
+  - Regex:           '^<cub'
+    Priority:        1
+  - Regex:           '^<thrust'
+    Priority:        2
+  - Regex:           '^<cuda'
+    Priority:        3
+  - Regex:           '^<[a-z]*>$'
+    Priority:        4
+  - Regex:           '^<unittest'
+    Priority:        5    
+  - Regex:           '.*'
+    Priority:        6
+IndentCaseLabels: true
+IndentPPDirectives: None
+IndentWidth: 2
+KeepEmptyLinesAtTheStartOfBlocks: true
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+PenaltyBreakAssignment: 30
+PenaltyBreakBeforeFirstCallParameter: 50
+PenaltyBreakComment: 0
+PenaltyBreakFirstLessLess: 0
+PenaltyBreakString: 70
+PenaltyBreakTemplateDeclaration: 0
+PenaltyExcessCharacter: 100
+PenaltyReturnTypeOnItsOwnLine: 90
+PointerAlignment: Right
+ReflowComments: true
+SortIncludes: CaseInsensitive
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard: c++11
+TabWidth: 2
+UseTab: Never
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
new file mode 100644
index 000000000..68469e1f1
--- /dev/null
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,16 @@
+# Exclude these commits from git-blame and similar tools.
+#
+# To use this file, run the following command from the repo root:
+#
+# ```
+# $ git config blame.ignoreRevsFile .git-blame-ignore-revs
+# ```
+#
+# Include a brief comment with each commit added, for example:
+#
+# ```
+# d92d9f8baac5ec48a8f8718dd69f415a45efe372 # Initial clang-format
+# ```
+#
+# Only add commits that are pure formatting changes (e.g.
+# clang-format version changes, etc).
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..72def4091
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,5 @@
+blank_issues_enabled: false
+contact_links:
+  - name: Open Issue in CCCL Repository
+    url: https://github.com/NVIDIA/cccl/issues/new/choose
+    about:  This repository has moved! Please see the new home for Thrust. 
diff --git a/.github/workflows/deploy-documentation-github-pages.yml b/.github/workflows/deploy-documentation-github-pages.yml
new file mode 100644
index 000000000..508764c5c
--- /dev/null
+++ b/.github/workflows/deploy-documentation-github-pages.yml
@@ -0,0 +1,27 @@
+name: Deploy Documentation GitHub Pages
+
+on:
+  push:
+    branches:
+      - "main"
+
+  # Trigger on request.
+  workflow_dispatch:
+
+jobs:
+  deploy-documentation-github-pages:
+    runs-on: ubuntu-latest
+    container: gpuci/cccl:cuda11.7.0-devel-ubuntu20.04-gcc9
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+      - name: Generate documentation markdown
+        run: ./docs/generate_markdown.bash --clean
+      - name: Deploy generated documentation markdown to gh-pages branch
+        uses: peaceiris/actions-gh-pages@v3
+        if: github.ref == 'refs/heads/main'
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./build_docs/github_pages
+          enable_jekyll: true
+          commit_message: "Deploy Documentation: ${{ github.event.head_commit.message }}"
diff --git a/.github/workflows/mirror-main-branch-to-master-branch.yml b/.github/workflows/mirror-main-branch-to-master-branch.yml
new file mode 100644
index 000000000..f9c861a3f
--- /dev/null
+++ b/.github/workflows/mirror-main-branch-to-master-branch.yml
@@ -0,0 +1,17 @@
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  mirror-main-branch-to-master-branch:
+    name: Mirror main branch to master branch
+    runs-on: ubuntu-latest
+    steps:
+    - name: Mirror main branch to master branch
+      id: mirror
+      uses: google/mirror-branch-action@v1.0
+      with:
+        source: main
+        dest: master
+        github-token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/xrefcheck-validate-markdown-links.yml b/.github/workflows/xrefcheck-validate-markdown-links.yml
new file mode 100755
index 000000000..78e5ade71
--- /dev/null
+++ b/.github/workflows/xrefcheck-validate-markdown-links.yml
@@ -0,0 +1,18 @@
+name: Check bad links
+
+on:
+  push:
+    branches: [ '*' ]
+  pull_request:
+    branches: [ '*' ]
+
+jobs:
+  xrefcheck:
+    name: Check links
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: serokell/xrefcheck-action@v1
+      with:
+        xrefcheck-version: 0.2
+        xrefcheck-args: --ignored dependencies
diff --git a/.gitignore b/.gitignore
index 23c24885c..37d8ba566 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,4 @@
-targets/
-*.pyc
-*.bak
-*.swp
-*.sconsign.dblite
-*.pgm
-*~
+discrete_voronoi.pgm
+*build*/
+.idea/
+.vscode
diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..0bb39f302
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,6 @@
+[submodule "cub"]
+	path = dependencies/cub
+	url = ../cub.git
+[submodule "libcudacxx"]
+	path = dependencies/libcudacxx
+	url = ../libcudacxx.git
diff --git a/CHANGELOG b/CHANGELOG
deleted file mode 100644
index 1707982f7..000000000
--- a/CHANGELOG
+++ /dev/null
@@ -1,819 +0,0 @@
-#######################################
-#           Thrust v1.8.3             #
-#######################################
-
-Summary
-    Small bug fixes
-
-New Examples
-    range_view demonstrates use of a view: a non-owning wrapper for an iterator range with a container-like interface
-
-Bug Fixes
-    copy_if now copies in a user provided stream instead of a default_stream
-    {min,max,minmax}_element can now accept raw device pointer with device execution policy
-
-#######################################
-#           Thrust v1.8.2             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Avoid warnings and errors concerning user functions called from __host__ __device__ functions
-    #632 CUDA set_intersection_by_key error
-    #651 thrust::copy between host & device is not interoperable with thrust::cuda::par.on(stream)
-    #664 CUDA for_each ignores execution policy's stream
-
-Known Issues
-    #628 CUDA's reduce_by_key fails on sm_50 devices
-
-#######################################
-#           Thrust v1.8.1             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    #615 CUDA for_each accesses illegal memory locations when given a large range
-    #620 CUDA's reduce_by_key fails on large input
-
-Known Issues
-    #628 CUDA's reduce_by_key fails on sm_50 devices
-
-#######################################
-#           Thrust v1.8.0             #
-#######################################
-
-Summary
-    Thrust 1.8.0 introduces support for algorithm invocation from CUDA __device__ code, support for CUDA streams,
-    and algorithm performance improvements. Users may now invoke Thrust algorithms from CUDA __device__ code,
-    providing a parallel algorithms library to CUDA programmers authoring custom kernels, as well as allowing
-    Thrust programmers to nest their algorithm calls within functors. The thrust::seq execution policy
-    allows users to require sequential algorithm execution in the calling thread and makes a
-    sequential algorithms library available to individual CUDA threads. The .on(stream) syntax allows users to
-    request a CUDA stream for kernels launched during algorithm execution. Finally, new CUDA algorithm
-    implementations provide substantial performance improvements.
-
-Breaking API Changes
-    None.
-
-New Features
-    Algorithms in CUDA __device__ code
-      Thrust algorithms may now be invoked from CUDA __device__ and __host__ __device__ functions.
-
-      Algorithms invoked in this manner must be invoked with an execution policy as the first parameter:
-
-      __device__ int my_device_sort(int *data, size_t n)
-      {
-        thrust::sort(thrust::device, data, data + n);
-      }
-
-      The following execution policies are supported in CUDA __device__ code:
-        thrust::seq
-        thrust::cuda::par
-        thrust::device, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-
-      Parallel algorithm execution may not be accelerated unless CUDA Dynamic Parallelism is available.
-
-    Execution Policies
-      CUDA Streams
-        The thrust::cuda::par.on(stream) syntax allows users to request that CUDA __global__ functions launched during algorithm 
-        execution should occur on a given stream:
-
-        // execute for_each on stream s
-        thrust::for_each(thrust::cuda::par.on(s), begin, end, my_functor);
-
-        Algorithms executed with a CUDA stream in this manner may still synchronize with other streams when allocating temporary
-        storage or returning results to the CPU.
-
-      thrust::seq
-        The thrust::seq execution policy allows users to require that an algorithm execute sequentially in the calling thread:
-
-        // execute for_each sequentially in this thread
-        thrust::for_each(thrust::seq, begin, end, my_functor);
-        
-    Other
-      The new thrust::complex template provides complex number support.
-
-New Examples
-    simple_cuda_streams demonstrates how to request a CUDA stream during algorithm execution.
-    async_reduce demonstrates ways to achieve algorithm invocations which are asynchronous with the calling thread.
-
-Other Enhancements
-    CUDA sort performance for user-defined types is 300% faster on Tesla K20c for large problem sizes.
-    CUDA merge performance is 200% faster on Tesla K20c for large problem sizes.
-    CUDA sort performance for primitive types is 50% faster on Tesla K20c for large problem sizes.
-    CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem sizes.
-    CUDA scan performance is 15% faster on Tesla K20c for large problem sizes.
-    fallback_allocator example is simpler.
-
-Bug Fixes
-    #364 iterators with unrelated system tags may be used with algorithms invoked with an execution policy
-    #371 do not redefine __CUDA_ARCH__
-    #379 fix crash when dereferencing transform_iterator on the CPU
-    #391 avoid use of uppercase variable names
-    #392 fix thrust::copy between cusp::complex & std::complex
-    #396 program compiled with gcc < 4.3 hangs during comparison sort
-    #406 fallback_allocator.cu example checks device for unified addressing support
-    #417 avoid using std::less<T> in binary search algorithms
-    #418 avoid various warnings
-    #443 including version.h no longer configures default systems
-    #578 nvcc produces warnings when sequential algorithms are used with cpu systems
-
-Known Issues
-    When invoked with primitive data types, thrust::sort, thrust::sort_by_key, thrust::stable_sort, & thrust::stable_sort_by_key may
-    fail to link in some cases with nvcc -rdc=true.
-
-    The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last element in a segment of equivalent keys instead of the first.
-
-Acknowledgments
-    Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan implementations.
-    Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation.
-    Thanks to Filipe Maia for contributing the implementation of thrust::complex.
-
-#######################################
-#           Thrust v1.7.2             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Avoid use of std::min in generic find implementation
-
-#######################################
-#           Thrust v1.7.1             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Eliminate identifiers in set_operations.cu example with leading underscore
-    Eliminate unused variable warning in CUDA reduce_by_key implemention
-    Avoid deriving function objects from std::unary_function and std::binary_function
-
-#######################################
-#           Thrust v1.7.0             #
-#######################################
-
-Summary
-    Thrust 1.7.0 introduces a new interface for controlling algorithm execution as
-    well as several new algorithms and performance improvements. With this new
-    interface, users may directly control how algorithms execute as well as details
-    such as the allocation of temporary storage. Key/value versions of thrust::merge
-    and the set operation algorithms have been added, as well stencil versions of
-    partitioning algorithms. thrust::tabulate has been introduced to tabulate the
-    values of functions taking integers. For 32b types, new CUDA merge and set
-    operations provide 2-15x faster performance while a new CUDA comparison sort
-    provides 1.3-4x faster performance. Finally, a new TBB reduce_by_key implementation
-    provides 80% faster performance.
-
-Breaking API Changes
-    Dispatch
-      Custom user backend systems' tag types must now inherit from the corresponding system's execution_policy template (e.g. thrust::cuda::execution_policy) instead
-      of the tag struct (e.g. thrust::cuda::tag). Otherwise, algorithm specializations will silently go unfound during dispatch.
-      See examples/minimal_custom_backend.cu and examples/cuda/fallback_allocator.cu for usage examples.
-
-      thrust::advance and thrust::distance are no longer dispatched based on iterator system type and thus may no longer be customized.
-
-    Iterators
-      iterator_facade and iterator_adaptor's Pointer template parameters have been eliminated.
-      iterator_adaptor has been moved into the thrust namespace (previously thrust::experimental::iterator_adaptor).
-      iterator_facade has been moved into the thrust namespace (previously thrust::experimental::iterator_facade).
-      iterator_core_access has been moved into the thrust namespace (previously thrust::experimental::iterator_core_access).
-      All iterators' nested pointer typedef (the type of the result of operator->) is now void instead of a pointer type to indicate that such expressions are currently impossible.
-      Floating point counting_iterators' nested difference_type typedef is now a signed integral type instead of a floating point type.
-
-    Other
-      normal_distribution has been moved into the thrust::random namespace (previously thrust::random::experimental::normal_distribution).
-      Placeholder expressions may no longer include the comma operator.
-
-New Features
-    Execution Policies
-      Users may directly control the dispatch of algorithm invocations with optional execution policy arguments.
-      For example, instead of wrapping raw pointers allocated by cudaMalloc with thrust::device_ptr, the thrust::device execution_policy may be passed as an argument to an algorithm invocation to enable CUDA execution.
-      The following execution policies are supported in this version:
-
-        thrust::host
-        thrust::device
-        thrust::cpp::par
-        thrust::cuda::par
-        thrust::omp::par
-        thrust::tbb::par
-
-    Algorithms
-	free
-	get_temporary_buffer
-	malloc
-        merge_by_key
-        partition with stencil
-        partition_copy with stencil
-	return_temporary_buffer
-        set_difference_by_key
-        set_intersection_by_key
-        set_symmetric_difference_by_key
-        set_union_by_key
-        stable_partition with stencil
-        stable_partition_copy with stencil
-	tabulate
-
-New Examples
-    uninitialized_vector demonstrates how to use a custom allocator to avoid the automatic initialization of elements in thrust::device_vector.
-
-Other Enhancements
-    Authors of custom backend systems may manipulate arbitrary state during algorithm dispatch by incorporating it into their execution_policy parameter.
-    Users may control the allocation of temporary storage during algorithm execution by passing standard allocators as parameters via execution policies such as thrust::device.
-    THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the device backend. 
-    CUDA merge performance is 2-15x faster.
-    CUDA comparison sort performance is 1.3-4x faster.
-    CUDA set operation performance is 1.5-15x faster.
-    TBB reduce_by_key performance is 80% faster.
-    Several algorithms have been parallelized with TBB.
-    Support for user allocators in vectors has been improved.
-    The sparse_vector example is now implemented with merge_by_key instead of sort_by_key.
-    Warnings have been eliminated in various contexts.
-    Warnings about __host__ or __device__-only functions called from __host__ __device__ functions have been eliminated in various contexts.
-    Documentation about algorithm requirements have been improved.
-    Simplified the minimal_custom_backend example.
-    Simplified the cuda/custom_temporary_allocation example.
-    Simplified the cuda/fallback_allocator example.
-
-Bug Fixes
-    #248 fix broken counting_iterator<float> behavior with OpenMP
-    #231, #209 fix set operation failures with CUDA
-    #187 fix incorrect occupancy calculation with CUDA
-    #153 fix broken multigpu behavior with CUDA
-    #142 eliminate warning produced by thrust::random::taus88 and MSVC 2010
-    #208 correctly initialize elements in temporary storage when necessary
-    #16 fix compilation error when sorting bool with CUDA
-    #10 fix ambiguous overloads of reinterpret_tag
-
-Known Issues
-    g++ versions 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly causing infinite recursion in examples such as cuda/custom_temporary_allocation.
-
-Acknowledgments
-    Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing a faster merge implementation for CUDA.
-    Thanks to Sean Baxter for contributing a faster set operation implementation for CUDA.
-    Thanks to Cliff Woolley for contributing a correct occupancy calculation algorithm.
-
-#######################################
-#           Thrust v1.6.0             #
-#######################################
-
-Summary
-    Thrust v1.6.0 provides an interface for customization and extension and a new
-    backend system based on the Threading Building Blocks library. With this
-    new interface, programmers may customize the behavior of specific algorithms
-    as well as control the allocation of temporary storage or invent entirely new
-    backends. These enhancements also allow multiple different backend systems
-    such as CUDA and OpenMP to coexist within a single program. Support for TBB
-    allows Thrust programs to integrate more naturally into applications which
-    may already employ the TBB task scheduler.
-
-Breaking API Changes
-    The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to <thrust/system/cuda/experimental/pinned_allocator.h>
-    thrust::experimental::cuda::pinned_allocator has been moved to thrust::cuda::experimental::pinned_allocator
-    The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM
-    The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA
-    The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP
-    thrust::host_space_tag has been renamed thrust::host_system_tag
-    thrust::device_space_tag has been renamed thrust::device_system_tag
-    thrust::any_space_tag has been renamed thrust::any_system_tag
-    thrust::iterator_space has been renamed thrust::iterator_system
-    
-
-New Features
-    Backend Systems
-        Threading Building Blocks (TBB) is now supported
-    Functions
-        for_each_n
-        raw_reference_cast
-    Types
-        pointer
-        reference
-
-New Examples
-    cuda/custom_temporary_allocation
-    cuda/fallback_allocator
-    device_ptr
-    expand
-    minimal_custom_backend
-    raw_reference_cast
-    set_operations
-
-Other Enhancements
-    thrust::for_each now returns the end of the input range similar to most other algorithms
-    thrust::pair and thrust::tuple have swap functionality
-    all CUDA algorithms now support large data types
-    iterators may be dereferenced in user __device__ or __global__ functions
-    the safe use of different backend systems is now possible within a single binary
-
-Bug Fixes
-    #469 min_element and max_element algorithms no longer require a const comparison operator
-
-Known Issues
-    cudafe++.exe may crash when parsing TBB headers on Windows. 
-
-#######################################
-#           Thrust v1.5.3             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Avoid warnings about potential race due to __shared__ non-POD variable
-
-#######################################
-#           Thrust v1.5.2             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Fixed warning about C-style initialization of structures
-
-#######################################
-#           Thrust v1.5.1             #
-#######################################
-
-Summary
-    Small bug fixes
-
-Bug Fixes
-    Sorting data referenced by permutation_iterators on CUDA produces invalid results
-
-#######################################
-#           Thrust v1.5.0             #
-#######################################
-
-Summary
-    Thrust v1.5.0 provides introduces new programmer productivity and performance
-    enhancements. New functionality for creating anonymous "lambda" functions has
-    been added. A faster host sort provides 2-10x faster performance for sorting
-    arithmetic types on (single-threaded) CPUs. A new OpenMP sort provides
-    2.5x-3.0x speedup over the host sort using a quad-core CPU. When sorting
-    arithmetic types with the OpenMP backend the combined performance improvement
-    is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to 14.2x
-    (8-bit types). A new CUDA reduce_by_key implementation provides 2-3x faster
-    performance.
-
-Breaking API Changes
-    device_ptr<void> no longer unsafely converts to device_ptr<T> without an
-    explicit cast. Use the expression
-    device_pointer_cast(static_cast<int*>(void_ptr.get()))
-    to convert, for example, device_ptr<void> to device_ptr<int>.
-
-New Features
-    Functions
-        stencil-less transform_if
-
-    Types
-        lambda placeholders
-
-New Examples
-    lambda
-
-Other Enhancements
-    host sort is 2-10x faster for arithmetic types
-    OMP sort provides speedup over host sort
-    reduce_by_key is 2-3x faster
-    reduce_by_key no longer requires O(N) temporary storage
-    CUDA scan algorithms are 10-40% faster
-    host_vector and device_vector are now documented
-    out-of-memory exceptions now provide detailed information from CUDART
-    improved histogram example
-    device_reference now has a specialized swap
-    reduce_by_key and scan algorithms are compatible with discard_iterator
-
-Removed Functionality
-
-Bug Fixes
-     #44 allow host_vector to compile when value_type uses __align__
-    #198 allow adjacent_difference to permit safe in-situ operation
-    #303 make thrust thread-safe
-    #313 avoid race conditions in device_vector::insert
-    #314 avoid unintended adl invocation when dispatching copy
-    #365 fix merge and set operation failures
-
-Known Issues
-    None
-
-Acknowledgments
-    Thanks to Manjunath Kudlur for contributing his Carbon library, from which the lambda functionality is derived.
-    Thanks to Jean-Francois Bastien for suggesting a fix for issue 303.
-
-#######################################
-#           Thrust v1.4.0             #
-#######################################
-
-Summary
-    Thrust v1.4.0 provides support for CUDA 4.0 in addition to many feature
-    and performance improvements.  New set theoretic algorithms operating on
-    sorted sequences have been added.  Additionally, a new fancy iterator
-    allows discarding redundant or otherwise unnecessary output from
-    algorithms, conserving memory storage and bandwidth.
-
-Breaking API Changes
-    Eliminations
-        thrust/is_sorted.h
-        thrust/utility.h
-        thrust/set_intersection.h
-        thrust/experimental/cuda/ogl_interop_allocator.h and the functionality therein
-        thrust::deprecated::copy_when
-        thrust::deprecated::absolute_value
-
-New Features
-    Functions
-        copy_n
-        merge
-        set_difference
-        set_symmetric_difference
-        set_union
-
-    Types
-        discard_iterator
-
-    Device support
-        Compute Capability 2.1 GPUs
-
-New Examples
-    run_length_decoding
-
-Other Enhancements
-    Compilation warnings are substantially reduced in various contexts.
-    The compilation time of thrust::sort, thrust::stable_sort, thrust::sort_by_key,
-    and thrust::stable_sort_by_key are substantially reduced.
-    A fast sort implementation is used when sorting primitive types with thrust::greater.
-    The performance of thrust::set_intersection is improved.
-    The performance of thrust::fill is improved on SM 1.x devices.
-    A code example is now provided in each algorithm's documentation.
-    thrust::reverse now operates in-place
-
-Removed Functionality
-    thrust::deprecated::copy_when
-    thrust::deprecated::absolute_value
-    thrust::experimental::cuda::ogl_interop_allocator
-    thrust::gather and thrust::scatter from host to device and vice versa are no longer supported.
-    Operations which modify the elements of a thrust::device_vector are no longer
-    available from source code compiled without nvcc when the device backend is CUDA.
-    Instead, use the idiom from the cpp_interop example.
-
-Bug Fixes
-    #212 set_intersection works correctly for large input sizes.
-    #275 counting_iterator and constant_iterator work correctly with OpenMP as the
-    backend when compiling with optimization
-    #256 min and max correctly return their first argument as a tie-breaker
-    #248 NDEBUG is interpreted correctly
-
-Known Issues
-    nvcc may generate code containing warnings when compiling some Thrust algorithms.
-    When compiling with -arch=sm_1x, some Thrust algorithms may cause nvcc to issue
-    benign pointer advisories.
-    When compiling with -arch=sm_1x and -G, some Thrust algorithms may fail to execute correctly.
-    thrust::inclusive_scan, thrust::exclusive_scan, thrust::inclusive_scan_by_key,
-    and thrust::exclusive_scan_by_key are currently incompatible with thrust::discard_iterator.
-
-Acknowledgments
-    Thanks to David Tarjan for improving the performance of set_intersection.
-    Thanks to Duane Merrill for continued help with sort.
-    Thanks to Nathan Whitehead for help with CUDA Toolkit integration.
-
-#######################################
-#           Thrust v1.3.0             #
-#######################################
-
-Summary
-    Thrust v1.3.0 provides support for CUDA 3.2 in addition to many feature
-    and performance enhancements.
-    
-    Performance of the sort and sort_by_key algorithms is improved by as much 
-    as 3x in certain situations.  The performance of stream compaction algorithms,
-    such as copy_if, is improved by as much as 2x.  Reduction performance is 
-    also improved, particularly for small input sizes.
-    
-    CUDA errors are now converted to runtime exceptions using the system_error
-    interface.  Combined with a debug mode, also new in v1.3, runtime errors
-    can be located with greater precision.
-
-    Lastly, a few header files have been consolidated or renamed for clarity.
-    See the deprecations section below for additional details.
-
-
-Breaking API Changes
-    Promotions
-        thrust::experimental::inclusive_segmented_scan has been renamed thrust::inclusive_scan_by_key and exposes a different interface
-        thrust::experimental::exclusive_segmented_scan has been renamed thrust::exclusive_scan_by_key and exposes a different interface
-        thrust::experimental::partition_copy has been renamed thrust::partition_copy and exposes a different interface
-        thrust::next::gather has been renamed thrust::gather
-        thrust::next::gather_if has been renamed thrust::gather_if
-        thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy
-    Deprecations
-        thrust::copy_when has been renamed thrust::deprecated::copy_when
-        thrust::absolute_value has been renamed thrust::deprecated::absolute_value
-        The header thrust/set_intersection.h is now deprecated; use thrust/set_operations.h instead
-        The header thrust/utility.h is now deprecated; use thrust/swap.h instead
-        The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead
-    Eliminations
-        thrust::deprecated::gather
-        thrust::deprecated::gather_if
-        thrust/experimental/arch.h and the functions therein
-        thrust/sorting/merge_sort.h
-        thrust/sorting/radix_sort.h
-
-New Features
-    Functions
-        exclusive_scan_by_key
-        find
-        find_if
-        find_if_not
-        inclusive_scan_by_key
-        is_partitioned
-        is_sorted_until
-        mismatch
-        partition_point
-        reverse
-        reverse_copy
-        stable_partition_copy
-
-    Types
-        system_error and related types
-        experimental::cuda::ogl_interop_allocator
-        bit_and, bit_or, and bit_xor
-
-    Device support
-        gf104-based GPUs
-
-New Examples
-    opengl_interop.cu
-    repeated_range.cu
-    simple_moving_average.cu
-    sparse_vector.cu
-    strided_range.cu
-
-Other Enhancements
-    Performance of thrust::sort and thrust::sort_by_key is substantially improved for primitive key types
-    Performance of thrust::copy_if is substantially improved
-    Performance of thrust::reduce and related reductions is improved
-    THRUST_DEBUG mode added
-    Callers of Thrust functions may detect error conditions by catching thrust::system_error, which derives from std::runtime_error
-    The number of compiler warnings generated by Thrust has been substantially reduced
-    Comparison sort now works correctly for input sizes > 32M
-    min & max usage no longer collides with <windows.h> definitions
-    Compiling against the OpenMP backend no longer requires nvcc
-    Performance of device_vector initialized in .cpp files is substantially improved in common cases
-    Performance of thrust::sort_by_key on the host is substantially improved
-
-Removed Functionality
-    nvcc 2.3 is no longer supported
-
-Bug Fixes
-    Debug device code now compiles correctly
-    thrust::uninitialized_copy and thrust::unintialized_fill now dispatch constructors on the device rather than the host
-
-Known Issues
-    #212 set_intersection is known to fail for large input sizes
-    partition_point is known to fail for 64b types with nvcc 3.2
-
-Acknowledgments
-    Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation
-    Thanks to Erich Elsen for contributing an implementation of find_if
-    Thanks to Andrew Corrigan for contributing changes which allow the OpenMP backend to compile in the absence of nvcc
-    Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for bug reports
-    Thanks to Cliff Woolley for help with testing
-
-#######################################
-#           Thrust v1.2.1             #
-#######################################
-
-Summary
-    Small fixes for compatibility with CUDA 3.1
-
-Known Issues
-    inclusive_scan & exclusive_scan may fail with very large types
-    the Microsoft compiler may fail to compile code using both sort and binary search algorithms
-    uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device
-    # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads
-    default_random_engine::discard is not accelerated with nvcc 2.3
-    nvcc 3.1 may fail to compile code using types derived from thrust::subtract_with_carry_engine, such as thrust::ranlux24 & thrust::ranlux48.
-
-#######################################
-#           Thrust v1.2.0             #
-#######################################
-
-Summary
-    Thrust v1.2 introduces support for compilation to multicore CPUs
-    and the Ocelot virtual machine, and several new facilities for
-    pseudo-random number generation.  New algorithms such as set
-    intersection and segmented reduction have also been added.  Lastly,
-    improvements to the robustness of the CUDA backend ensure
-    correctness across a broad set of (uncommon) use cases.
-
-Breaking API Changes
-    thrust::gather's interface was incorrect and has been removed.
-    The old interface is deprecated but will be preserved for Thrust
-    version 1.2 at thrust::deprecated::gather &
-    thrust::deprecated::gather_if. The new interface is provided at
-    thrust::next::gather & thrust::next::gather_if.  The new interface
-    will be promoted to thrust:: in Thrust version 1.3. For more details,
-    please refer to this thread:
-    http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd
-
-    The thrust::sorting namespace has been deprecated in favor of the
-    top-level sorting functions, such as thrust::sort() and
-    thrust::sort_by_key().
-
-New Features
-    Functions
-        reduce_by_key
-        set_intersection
-        tie
-        unique_copy
-        unique_by_key
-        unique_copy_by_key
-
-    Types
-        Random Number Generation
-            discard_block_engine
-            default_random_engine
-            linear_congruential_engine
-            linear_feedback_shift_engine
-            minstd_rand
-            minstd_rand0
-            normal_distribution (experimental)
-            ranlux24
-            ranlux48
-            ranlux24_base
-            ranlux48_base
-            subtract_with_carry_engine
-            taus88
-            uniform_int_distribution
-            uniform_real_distribution
-            xor_combine_engine
-        Functionals
-            project1st
-            project2nd
-
-    Fancy Iterators
-        permutation_iterator
-        reverse_iterator
-
-    Device support
-        Add support for multicore CPUs via OpenMP
-        Add support for Fermi-class GPUs
-        Add support for Ocelot virtual machine
-
-New Examples
-    cpp_integration
-    histogram
-    mode
-    monte_carlo
-    monte_carlo_disjoint_sequences
-    padded_grid_reduction
-    permutation_iterator
-    row_sum
-    run_length_encoding
-    segmented_scan
-    stream_compaction
-    summary_statistics
-    transform_iterator
-    word_count
-
-Other Enhancements
-    vector functions operator!=, rbegin, crbegin, rend, crend, data, & shrink_to_fit
-    integer sorting performance is improved when max is large but (max - min) is small and when min is negative
-    performance of inclusive_scan() and exclusive_scan() is improved by 20-25% for primitive types
-    support for nvcc 3.0
-
-Removed Functionality
-    removed support for equal between host & device sequences
-    removed support for gather() and scatter() between host & device sequences
-
-Bug Fixes
-    # 8 cause a compiler error if the required compiler is not found rather than a mysterious error at link time
-    # 42 device_ptr & device_reference are classes rather than structs, eliminating warnings on certain platforms
-    # 46 gather & scatter handle any space iterators correctly
-    # 51 thrust::experimental::arch functions gracefully handle unrecognized GPUs
-    # 52 avoid collisions with common user macros such as BLOCK_SIZE
-    # 62 provide better documentation for device_reference
-    # 68 allow built-in CUDA vector types to work with device_vector in pure C++ mode
-    # 102 eliminated a race condition in device_vector::erase
-    various compilation warnings eliminated
-
-Known Issues
-   inclusive_scan & exclusive_scan may fail with very large types
-   the Microsoft compiler may fail to compile code using both sort and binary search algorithms
-   uninitialized_fill & uninitialized_copy dispatch constructors on the host rather than the device
-   # 109 some algorithms may exhibit poor performance with the OpenMP backend with large numbers (>= 6) of CPU threads
-   default_random_engine::discard is not accelerated with nvcc 2.3
-
-Acknowledgments
-   Thanks to Gregory Diamos for contributing a CUDA implementation of set_intersection
-   Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit tests and examples against Ocelot
-   Thanks to Tom Bradley for contributing an implementation of normal_distribution
-   Thanks to Joseph Rhoads for contributing the example summary_statistics
-
-#######################################
-#           Thrust v1.1.1             #
-#######################################
-
-Summary
-    Small fixes for compatibility with CUDA 2.3a and Mac OSX Snow Leopard.
-
-#######################################
-#           Thrust v1.1.0             #
-#######################################
-
-Summary
-    Thrust v1.1 introduces fancy iterators, binary search functions, and
-    several specialized reduction functions.  Experimental support for
-    segmented scan has also been added.
-
-Breaking API Changes
-    counting_iterator has been moved into the thrust namespace (previously thrust::experimental)
-
-New Features
-    Functions
-        copy_if
-        lower_bound
-        upper_bound
-        vectorized lower_bound
-        vectorized upper_bound
-        equal_range
-        binary_search
-        vectorized binary_search
-        all_of
-        any_of
-        none_of
-        minmax_element
-        advance
-        inclusive_segmented_scan (experimental)
-        exclusive_segmented_scan (experimental)
-
-    Types
-        pair
-        tuple
-        device_malloc_allocator
-
-    Fancy Iterators
-        constant_iterator
-        counting_iterator
-        transform_iterator
-        zip_iterator
-
-New Examples
-    computing the maximum absolute difference between vectors
-    computing the bounding box of a two-dimensional point set
-    sorting multiple arrays together (lexicographical sorting)
-    constructing a summed area table
-    using zip_iterator to mimic an array of structs
-    using constant_iterator to increment array values
-
-Other Enhancements
-    added pinned memory allocator (experimental)
-    added more methods to host_vector & device_vector (issue #4)
-    added variant of remove_if with a stencil argument (issue #29)
-    scan and reduce use cudaFuncGetAttributes to determine grid size
-    exceptions are reported when temporary device arrays cannot be allocated 
-
-Bug Fixes
-     #5 make vector work for larger data types
-     #9 stable_partition_copy doesn't respect OutputIterator concept semantics
-    #10 scans should return OutputIterator
-    #16 make algorithms work for larger data types
-    #27 dispatch radix_sort even when comp=less<T> is explicitly provided
-
-Known Issues
-    Using functors with Thrust entry points may not compile on Mac OSX with gcc-4.0.1
-    uninitialized_copy & uninitialized_fill dispatch constructors on the host rather than the device.
-    inclusive_scan, inclusive_scan_by_key, exclusive_scan, and exclusive_scan_by_key may fail when used with large types with the CUDA 3.1 driver
-
-
-#######################################
-#           Thrust v1.0.0             #
-#######################################
-
-Breaking API changes
-    Rename top level namespace komrade to thrust.
-    Move partition_copy() & stable_partition_copy() into thrust::experimental namespace until we can easily provide the standard interface.
-    Rename range() to sequence() to avoid collision with Boost.Range.
-    Rename copy_if() to copy_when() due to semantic differences with C++0x copy_if().
-
-New Features
-    Add C++0x style cbegin() & cend() methods to host_vector & device_vector.
-    Add transform_if function.
-    Add stencil versions of replace_if() & replace_copy_if().
-    Allow counting_iterator to work with for_each().
-    Allow types with constructors in comparison sort & reduce.
-
-Other Enhancements
-    merge_sort and stable_merge_sort are now 2 to 5x faster when executed on the parallel device.
-
-Bug fixes
-    Workaround an issue where an incremented iterator causes nvcc to crash. (Komrade issue #6)
-    Fix an issue where const_iterators could not be passed to transform. (Komrade issue #7)
-
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 000000000..57eff4212
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,2315 @@
+# Changelog
+
+## Thrust 2.1.0
+
+### New Features
+
+- NVIDIA/thrust#1805: Add default constructors to `transform_output_iterator`
+  and `transform_input_output_iterator`. Thanks to Mark Harris (@harrism) for this contribution.
+- NVIDIA/thrust#1836: Enable constructions of vectors from `std::initializer_list`.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1768: Fix type conversion warning in the `thrust::complex` utilities. Thanks to
+  Zishi Wu (@zishiwu123) for this contribution.
+- NVIDIA/thrust#1809: Fix some warnings about usage of `__host__` functions in `__device__` code.
+- NVIDIA/thrust#1825: Fix Thrust's CMake install rules. Thanks to Robert Maynard (@robertmaynard)
+  for this contribution.
+- NVIDIA/thrust#1827: Fix `thrust::reduce_by_key` when using non-default-initializable iterators.
+- NVIDIA/thrust#1832: Fix bug in device-side CDP `thrust::reduce` when using a large number of
+  inputs.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1815: Update Thrust's libcu++ git submodule to version 1.8.1.
+- NVIDIA/thrust#1841: Fix invalid code in execution policy documentation example. Thanks to Raphaël
+  Frantz (@Eren121) for this contribution.
+- NVIDIA/thrust#1848: Improve error messages when attempting to launch a kernel on a device that is
+  not supported by compiled PTX versions. Thanks to Zahra Khatami (@zkhatami) for this contribution.
+- NVIDIA/thrust#1855: Remove usage of deprecated CUDA error codes.
+
+## Thrust 2.0.1
+
+### Other Enhancements
+
+- Disable CDP parallelization of device-side invocations of Thrust algorithms on SM90+. The removal
+  of device-side synchronization support in recent architectures makes Thrust's fork-join model
+  unimplementable on device, so a serial implementation will be used instead. Host-side invocations
+  of Thrust algorithms are not affected.
+
+## Thrust 2.0.0
+
+### Summary
+
+The Thrust 2.0.0 major release adds a dependency on libcu++ and contains several
+breaking changes. These include new diagnostics when inspecting device-only
+lambdas from the host, removal of the `cub` symlink in the Thrust repository
+root, and removal of the deprecated `THRUST_*_BACKEND` macros. It also includes
+several minor bugfixes and cleanups.
+
+### Breaking Changes
+
+- NVIDIA/thrust#1605: Add libcu++ dependency.
+    - A suitable version of libcu++ is provided through
+      the `${THRUST_ROOT}/dependencies/libcudacxx/` submodule.
+    - Non-cmake users may need to add the libcu++ include path to their
+      builds (`-I ${THRUST_ROOT}/dependencies/libcudacxx/include/`).
+    - The Thrust CMake packages have been updated to add this include path.
+- NVIDIA/thrust#1605: The following macros are no longer defined by default.
+  They can be re-enabled by defining `THRUST_PROVIDE_LEGACY_ARCH_MACROS`. These
+  will be removed completely in a future release.
+    - `THRUST_IS_HOST_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_IS_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_INCLUDE_HOST_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_INCLUDE_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+    - `THRUST_DEVICE_CODE`: Replace with `NV_IF_TARGET`.
+- NVIDIA/thrust#1661: Thrust's CUDA Runtime support macros have been updated to
+  support `NV_IF_TARGET`. They are now defined consistently across all
+  host/device compilation passes. This should not affect most usages of these
+  macros, but may require changes for some edge cases.
+    - `THRUST_RUNTIME_FUNCTION`: Execution space annotations for functions that
+      invoke CUDA Runtime APIs.
+        - Old behavior:
+            - RDC enabled: Defined to `__host__ __device__`
+            - RDC not enabled:
+                - NVCC host pass: Defined to `__host__ __device__`
+                - NVCC device pass: Defined to `__host__`
+        - New behavior:
+            - RDC enabled: Defined to `__host__ __device__`
+            - RDC not enabled: Defined to `__host__`
+    - `__THRUST_HAS_CUDART__`: No change in behavior, but no longer used in
+      Thrust. Provided for legacy support only. Legacy behavior:
+        - RDC enabled: Defined to 1.
+        - RDC not enabled:
+            - NVCC host pass: Defined to 1.
+            - NVCC device pass: Defined to 0.
+    - `THRUST_RDC_ENABLED`: New macro, may be combined with `NV_IF_TARGET` to
+      replace most usages of `__THRUST_HAS_CUDART__`. Behavior:
+        - RDC enabled: Macro is defined.
+        - RDC not enabled: Macro is not defined.
+- NVIDIA/thrust#1701: Remove the `cub` symlink from the root of the Thrust
+  repository.
+    - This symlink caused issues in certain build environments (e.g.
+      NVIDIA/thrust#1328).
+    - Builds that relied on this symlink will need to add the full CUB include
+      path (`-I ${THRUST_ROOT}/dependencies/cub`).
+    - CMake builds that use the Thrust packages via CPM, `add_subdirectory`,
+      or `find_package` are not affected.
+- NVIDIA/thrust#1760: A compile-time error is now emitted when a `__device__`
+  -only lambda's return type is queried from host code (requires libcu++ ≥
+  1.9.0).
+    - Due to limitations in the CUDA programming model, the result of this query
+      is unreliable, and will silently return an incorrect result. This leads to
+      difficult to debug errors.
+    - When using libcu++ 1.9.0, an error will be emitted with information about
+      work-arounds:
+        - Use a named function object with a `__device__`-only implementation
+          of `operator()`.
+        - Use a `__host__ __device__` lambda.
+        - Use `cuda::proclaim_return_type` (Added in libcu++ 1.9.0)
+- NVIDIA/thrust#1761: Removed support for deprecated `THRUST_DEVICE_BACKEND`
+  and `THRUST_HOST_BACKEND` macros. The `THRUST_DEVICE_SYSTEM`
+  and `THRUST_HOST_SYSTEM` macros should be used instead.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1605: Fix some execution space warnings in the allocator
+  library.
+- NVIDIA/thrust#1683: Fix bug in `iterator_category_to_traversal` metafunctions.
+- NVIDIA/thrust#1715: Add missing `__thrust_exec_check_disable__` annotation
+  to `thrust::make_zip_function`. Thanks to @mfbalin for this contribution.
+- NVIDIA/thrust#1722: Remove CUDA-specific error handler from code that may be
+  executed on non-CUDA backends. Thanks to @dkolsen-pgi for this contribution.
+- NVIDIA/thrust#1756: Fix `copy_if` for output iterators that don't support copy
+  assignment. Thanks for @mfbalin for this contribution.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1605: Removed special case code for unsupported CUDA
+  architectures.
+- NVIDIA/thrust#1605: Replace several usages of `__CUDA_ARCH__`
+  with `<nv/target>` to handle host/device code divergence.
+- NVIDIA/thrust#1752: Remove a leftover merge conflict from a documentation
+  file. Thanks to @tabedzki for this contribution.
+
+## Thrust 1.17.2
+
+### Summary
+
+Thrust 1.17.2 is a minor bugfix release that provides an updated version of CUB.
+
+## Thrust 1.17.1
+
+### Summary
+
+Thrust 1.17.1 is a minor bugfix release that provides an updated version of CUB.
+
+## Thrust 1.17.0
+
+### Summary
+
+Thrust 1.17.0 is the final minor release of the 1.X series. This release
+provides GDB pretty-printers for device vectors/references, a new `unique_count`
+algorithm, and an easier way to create tagged Thrust iterators. Several
+documentation fixes are included, which can be found on the new Thrust
+documentation site at https://nvidia.github.io/thrust. We'll be migrating
+existing documentation sources to this new location over the next few months.
+
+### New Features
+
+- NVIDIA/thrust#1586: Add new `thrust::make_tagged_iterator` convenience
+  function. Thanks to @karthikeyann for this contribution.
+- NVIDIA/thrust#1619: Add `unique_count` algorithm. Thanks to @upsj for this
+  contribution.
+- NVIDIA/thrust#1631: Add GDB pretty-printers for device vectors/references
+  to `scripts/gdb-pretty-printers.py`. Thanks to @upsj for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1671: Fixed `reduce_by_key` when called with 2^31 elements.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1512: Use CUB to implement `adjacent_difference`.
+- NVIDIA/thrust#1555: Use CUB to implement `scan_by_key`.
+- NVIDIA/thrust#1611: Add new doxybook-based Thrust documentation
+  at https://nvidia.github.io/thrust.
+- NVIDIA/thrust#1639: Fixed broken link in documentation. Thanks to @jrhemstad
+  for this contribution.
+- NVIDIA/thrust#1644: Increase contrast of search input text in new doc site.
+  Thanks to @bdice for this contribution.
+- NVIDIA/thrust#1647: Add `__forceinline__` annotations to a functor wrapper.
+  Thanks to @mkuron for this contribution.
+- NVIDIA/thrust#1660: Fixed typo in documentation example for
+  `permutation_iterator`.
+- NVIDIA/thrust#1669: Add a new `explicit_cuda_stream.cu` example that shows how
+  to use explicit CUDA streams and `par`/`par_nosync` execution policies.
+
+## Thrust 1.16.0
+
+### Summary
+
+Thrust 1.16.0 provides a new “nosync” hint for the CUDA backend, as well as
+numerous bugfixes and stability improvements.
+
+#### New `thrust::cuda::par_nosync` Execution Policy
+
+Most of Thrust's parallel algorithms are fully synchronous and will block the
+calling CPU thread until all work is completed. This design avoids many pitfalls
+associated with asynchronous GPU programming, resulting in simpler and
+less-error prone usage for new CUDA developers. Unfortunately, this improvement
+in user experience comes at a performance cost that often frustrates more
+experienced CUDA programmers.
+
+Prior to this release, the only synchronous-to-asynchronous migration path for
+existing Thrust codebases involved significant refactoring, replacing calls
+to `thrust` algorithms with a limited set of `future`-based `thrust::async`
+algorithms or lower-level CUB kernels. The new `thrust::cuda::par_nosync`
+execution policy provides a new, less-invasive entry point for asynchronous
+computation.
+
+`par_nosync` is a hint to the Thrust execution engine that any non-essential
+internal synchronizations should be skipped and that an explicit synchronization
+will be performed by the caller before accessing results.
+
+While some Thrust algorithms require internal synchronization to safely compute
+their results, many do not. For example, multiple `thrust::for_each` invocations
+can be launched without waiting for earlier calls to complete:
+
+```cpp
+// Queue three `for_each` kernels:
+thrust::for_each(thrust::cuda::par_nosync, vec1.begin(), vec1.end(), Op{});
+thrust::for_each(thrust::cuda::par_nosync, vec2.begin(), vec2.end(), Op{});
+thrust::for_each(thrust::cuda::par_nosync, vec3.begin(), vec3.end(), Op{});
+
+// Do other work while kernels execute:
+do_something();
+
+// Must explictly synchronize before accessing `for_each` results:
+cudaDeviceSynchronize();
+```
+
+Thanks to @fkallen for this contribution.
+
+### Deprecation Notices
+
+#### CUDA Dynamic Parallelism Support
+
+**A future version of Thrust will remove support for CUDA Dynamic Parallelism
+(CDP).**
+
+This will only affect calls to Thrust algorithms made from CUDA device-side code
+that currently launches a kernel; such calls will instead execute sequentially
+on the calling GPU thread instead of launching a device-wide kernel.
+
+### Breaking Changes
+
+- Thrust 1.14.0 included a change that aliased the `cub` namespace
+  to `thrust::cub`. This has caused issues with ambiguous namespaces for
+  projects that declare `using namespace thrust;` from the global namespace. We
+  recommend against this practice.
+- NVIDIA/thrust#1572: Removed several unnecessary header includes. Downstream
+  projects may need to update their includes if they were relying on this
+  behavior.
+
+### New Features
+
+- NVIDIA/thrust#1568: Add `thrust::cuda::par_nosync` policy. Thanks to @fkallen
+  for this contribution.
+
+### Enhancements
+
+- NVIDIA/thrust#1511: Use CUB's new `DeviceMergeSort` API and remove Thrust's
+  internal implementation.
+- NVIDIA/thrust#1566: Improved performance of `thrust::shuffle`. Thanks to
+  @djns99 for this contribution.
+- NVIDIA/thrust#1584: Support user-defined `CMAKE_INSTALL_INCLUDEDIR` values in
+  Thrust's CMake install rules. Thanks to @robertmaynard for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1496: Fix some issues affecting `icc` builds.
+- NVIDIA/thrust#1552: Fix some collisions with the `min`/`max`  macros defined
+  in `windows.h`.
+- NVIDIA/thrust#1582: Fix issue with function type alias on 32-bit MSVC builds.
+- NVIDIA/thrust#1591: Workaround issue affecting compilation with `nvc++`.
+- NVIDIA/thrust#1597: Fix some collisions with the `small` macro defined
+  in `windows.h`.
+- NVIDIA/thrust#1599, NVIDIA/thrust#1603: Fix some issues with version handling
+  in Thrust's CMake packages.
+- NVIDIA/thrust#1614: Clarify that scan algorithm results are non-deterministic
+  for pseudo-associative operators (e.g. floating-point addition).
+
+## Thrust 1.15.0
+
+### Summary
+
+Thrust 1.15.0 provides numerous bugfixes, including non-numeric
+`thrust::sequence` support, several MSVC-related compilation fixes, fewer
+conversion warnings, `counting_iterator` initialization, and documentation
+updates.
+
+### Deprecation Notices
+
+**A future version of Thrust will remove support for CUDA Dynamic Parallelism
+(CDP).**
+
+This will only affect calls to Thrust algorithms made from CUDA device-side code
+that currently launches a kernel; such calls will instead execute sequentially
+on the calling GPU thread instead of launching a device-wide kernel.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1507: Allow `thrust::sequence` to work with non-numeric types.
+  Thanks to Ben Jude (@bjude) for this contribution.
+- NVIDIA/thrust#1509: Avoid macro collision when calling `max()` on MSVC. Thanks
+  to Thomas (@tomintheshell) for this contribution.
+- NVIDIA/thrust#1514: Initialize all members in `counting_iterator`'s default
+  constructor.
+- NVIDIA/thrust#1518: Fix `std::allocator_traits` on MSVC + C++17.
+- NVIDIA/thrust#1530: Fix several `-Wconversion` warnings. Thanks to Matt
+  Stack (@matt-stack) for this contribution.
+- NVIDIA/thrust#1539: Fixed typo in `thrust::for_each` documentation. Thanks to
+  Salman (@untamedImpala) for this contribution.
+- NVIDIA/thrust#1548: Avoid name collision with `B0` macro in termios.h system
+  header. Thanks to Philip Deegan (@PhilipDeegan) for this contribution.
+
+## Thrust 1.14.0 (NVIDIA HPC SDK 21.9)
+
+Thrust 1.14.0 is a major release accompanying the NVIDIA HPC SDK 21.9.
+
+This release adds the ability to wrap the `thrust::` namespace in an external
+namespace, providing a workaround for a variety of shared library linking
+issues. Thrust also learned to detect when CUB's symbols are in a wrapped
+namespace and properly import them. To enable this feature, use
+`#define THRUST_CUB_WRAPPED_NAMESPACE foo` to wrap both Thrust and CUB in the
+`foo::` namespace. See `thrust/detail/config/namespace.h` for details and more
+namespace options.
+
+Several bugfixes are also included: The `tuple_size` and `tuple_element` helpers
+now support cv-qualified types. `scan_by_key` uses less memory.
+`thrust::iterator_traits` is better integrated with `std::iterator_traits`.
+See below for more details and references.
+
+### Breaking Changes
+
+- Thrust 1.14.0 included a change that aliased the `cub` namespace
+  to `thrust::cub`. This has caused issues with ambiguous namespaces for
+  projects that declare `using namespace thrust;` from the global namespace. We
+  recommend against this practice.
+
+### New Features
+
+- NVIDIA/thrust#1464: Add preprocessor hooks that allow `thrust::` to be wrapped
+  in an external namespace, and support cases when CUB is wrapped in an external
+  namespace.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1457: Support cv-qualified types in `thrust::tuple_size` and
+  `thrust::tuple_element`. Thanks to Jake Hemstad for this contribution.
+- NVIDIA/thrust#1471: Fixed excessive memory allocation in `scan_by_key`. Thanks
+  to Lilo Huang for this contribution.
+- NVIDIA/thrust#1476: Removed dead code from the `expand` example. Thanks to
+  Lilo Huang for this contribution.
+- NVIDIA/thrust#1488: Fixed the path to the installed CUB headers in the CMake
+  `find_package` configuration files.
+- NVIDIA/thrust#1491: Fallback to `std::iterator_traits` when no
+  `thrust::iterator_traits` specialization exists for an iterator type. Thanks
+  to Divye Gala for this contribution.
+
+## Thrust 1.13.1 (CUDA Toolkit 11.5)
+
+Thrust 1.13.1 is a minor release accompanying the CUDA Toolkit 11.5.
+
+This release provides a new hook for embedding the `thrust::` namespace inside a
+custom namespace. This is intended to work around various issues related to
+linking multiple shared libraries that use Thrust. The existing `CUB_NS_PREFIX`
+and `CUB_NS_POSTFIX` macros already provided this capability for CUB; this
+update provides a simpler mechanism that is extended to and integrated with
+Thrust. Simply define `THRUST_CUB_WRAPPED_NAMESPACE` to a namespace name, and
+both `thrust::` and `cub::` will be placed inside the new namespace. Using
+different wrapped namespaces for each shared library will prevent issues like
+those reported in NVIDIA/thrust#1401.
+
+### New Features
+
+- NVIDIA/thrust#1464: Add `THRUST_CUB_WRAPPED_NAMESPACE` hooks.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1488: Fix path to installed CUB in Thrust's CMake config files.
+
+## Thrust 1.13.0 (NVIDIA HPC SDK 21.7)
+
+Thrust 1.13.0 is the major release accompanying the NVIDIA HPC SDK 21.7 release.
+Notable changes include `bfloat16` radix sort support (via `thrust::sort`) and
+  memory handling fixes in the `reserve` method of Thrust's vectors.
+The `CONTRIBUTING.md` file has been expanded to include instructions for
+  building CUB as a component of Thrust, and API documentation now refers to
+  [cppreference](https://cppreference.com) instead of SGI's old STL reference.
+
+### Breaking Changes
+
+- NVIDIA/thrust#1459: Remove deprecated aliases `thrust::host_space_tag` and
+  `thrust::device_space_tag`. Use the equivalent `thrust::host_system_tag` and
+  `thrust::device_system_tag` instead.
+
+### New Features
+
+- NVIDIA/cub#306: Add radix-sort support for `bfloat16` in `thrust::sort`.
+  Thanks to Xiang Gao (@zasdfgbnm) for this contribution.
+- NVIDIA/thrust#1423: `thrust::transform_iterator` now supports non-copyable
+  types. Thanks to Jake Hemstad (@jrhemstad) for this contribution.
+- NVIDIA/thrust#1459: Introduce a new `THRUST_IGNORE_DEPRECATED_API` macro that
+  disables deprecation warnings on Thrust and CUB APIs.
+
+### Bug Fixes
+
+- NVIDIA/cub#277: Fixed sanitizer warnings when `thrust::sort` calls
+  into `cub::DeviceRadixSort`. Thanks to Andy Adinets (@canonizer) for this
+  contribution.
+- NVIDIA/thrust#1442: Reduce extraneous comparisons in `thrust::sort`'s merge
+  sort implementation.
+- NVIDIA/thrust#1447: Fix memory leak and avoid overallocation when
+  calling `reserve` on Thrust's vector containers. Thanks to Kai Germaschewski
+  (@germasch) for this contribution.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1405: Update links to standard C++ documentations from sgi to
+  cppreference. Thanks to Muhammad Adeel Hussain (@AdeilH) for this
+  contribution.
+- NVIDIA/thrust#1432: Updated build instructions in `CONTRIBUTING.md` to include
+  details on building CUB's test suite as part of Thrust.
+
+## Thrust 1.12.1 (CUDA Toolkit 11.4)
+
+Thrust 1.12.1 is a trivial patch release that slightly changes the phrasing of
+a deprecation message.
+
+## Thrust 1.12.0 (NVIDIA HPC SDK 21.3)
+
+Thrust 1.12.0 is the major release accompanying the NVIDIA HPC SDK 21.3
+  and the CUDA Toolkit 11.4.
+It includes a new `thrust::universal_vector`, which holds data that is
+  accessible from both host and device. This allows users to easily leverage
+  CUDA's unified memory with Thrust.
+New asynchronous `thrust::async:exclusive_scan` and `inclusive_scan` algorithms
+  have been added, and the synchronous versions of these have been updated to
+  use `cub::DeviceScan` directly.
+CUB radix sort for floating point types is now stable when both +0.0 and -0.0
+  are present in the input. This affects some usages of `thrust::sort` and
+  `thrust::stable_sort`.
+Many compilation warnings and subtle overflow bugs were fixed in the device
+  algorithms, including a long-standing bug that returned invalid temporary
+  storage requirements when `num_items` was close to (but not
+  exceeding) `INT32_MAX`.
+This release deprecates support for Clang < 7.0 and MSVC < 2019 (aka
+  19.20/16.0/14.20).
+
+### Breaking Changes
+
+- NVIDIA/thrust#1372: Deprecate Clang < 7 and MSVC < 2019.
+- NVIDIA/thrust#1376: Standardize `thrust::scan_by_key` functors / accumulator
+    types.
+  This may change the results from `scan_by_key` when input, output, and
+    initial value types are not the same type.
+
+### New Features
+
+- NVIDIA/thrust#1251: Add two new `thrust::async::` algorithms: `inclusive_scan`
+    and `exclusive_scan`.
+- NVIDIA/thrust#1334: Add `thrust::universal_vector`, `universal_ptr`,
+    and `universal_allocator`.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1347: Qualify calls to `make_reverse_iterator`.
+- NVIDIA/thrust#1359: Enable stricter warning flags. This fixes several
+  outstanding issues:
+  - NVIDIA/cub#221: Overflow in `temp_storage_bytes` when `num_items` close to
+      (but not over) `INT32_MAX`.
+  - NVIDIA/cub#228: CUB uses non-standard C++ extensions that break strict
+      compilers.
+  - NVIDIA/cub#257: Warning when compiling `GridEvenShare` with unsigned
+      offsets.
+  - NVIDIA/thrust#974: Conversion warnings in `thrust::transform_reduce`.
+  - NVIDIA/thrust#1091: Conversion warnings in `thrust::counting_iterator`.
+- NVIDIA/thrust#1373: Fix compilation error when a standard library type is
+    wrapped in `thrust::optional`.
+  Thanks to Vukasin Milovanovic for this contribution.
+- NVIDIA/thrust#1388: Fix `signbit(double)` implementation on MSVC.
+- NVIDIA/thrust#1389: Support building Thrust tests without CUDA enabled.
+
+### Other Enhancements
+
+- NVIDIA/thrust#1304: Use `cub::DeviceScan` to implement
+    `thrust::exclusive_scan` and `thrust::inclusive_scan`.
+- NVIDIA/thrust#1362, NVIDIA/thrust#1370: Update smoke test naming.
+- NVIDIA/thrust#1380: Fix typos in `set_operation` documentation.
+    Thanks to Hongyu Cai for this contribution.
+- NVIDIA/thrust#1383: Include FreeBSD license in LICENSE.md for
+  `thrust::complex` implementation.
+- NVIDIA/thrust#1384: Add missing precondition to `thrust::gather`
+    documentation.
+
+## Thrust 1.11.0 (CUDA Toolkit 11.3)
+
+Thrust 1.11.0 is a major release providing bugfixes and performance
+  enhancements.
+It includes a new sort algorithm that provides up to 2x more performance
+  from `thrust::sort` when used with certain key types and hardware.
+The new `thrust::shuffle` algorithm has been tweaked to improve the randomness
+  of the output.
+Our CMake package and build system continue to see improvements with
+  better `add_subdirectory` support, installation rules, status messages, and
+  other features that make Thrust easier to use from CMake projects.
+The release includes several other bugfixes and modernizations, and received
+  updates from 12 contributors.
+
+### New Features
+
+- NVIDIA/cub#204: New implementation for `thrust::sort` on CUDA when using
+    32/64-bit numeric keys on Pascal and up (SM60+).
+  This improved radix sort algorithm provides up to 2x more performance.
+  Thanks for Andy Adinets for this contribution.
+- NVIDIA/thrust#1310, NVIDIA/thrust#1312: Various tuple-related APIs have been
+    updated to use variadic templates.
+  Thanks for Andrew Corrigan for these contributions.
+- NVIDIA/thrust#1297: Optionally add install rules when included with
+    CMake's `add_subdirectory`.
+  Thanks to Kai Germaschewski for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1309: Fix `thrust::shuffle` to produce better quality random
+    distributions.
+  Thanks to Rory Mitchell and Daniel Stokes for this contribution.
+- NVIDIA/thrust#1337: Fix compile-time regression in `transform_inclusive_scan`
+    and `transform_exclusive_scan`.
+- NVIDIA/thrust#1306: Fix binary search `middle` calculation to avoid overflows.
+    Thanks to Richard Barnes for this contribution.
+- NVIDIA/thrust#1314: Use `size_t` for the index type parameter
+    in `thrust::tuple_element`.
+  Thanks to Andrew Corrigan for this contribution.
+- NVIDIA/thrust#1329: Fix runtime error when copying an empty
+    `thrust::device_vector` in MSVC Debug builds.
+  Thanks to Ben Jude for this contribution.
+- NVIDIA/thrust#1323: Fix and add test for cmake package install rules.
+  Thanks for Keith Kraus and Kai Germaschewski for testing and discussion.
+- NVIDIA/thrust#1338: Fix GCC version checks in `thrust::detail::is_pod`
+    implementation.
+  Thanks to Anatoliy Tomilov for this contribution.
+- NVIDIA/thrust#1289: Partial fixes for Clang 10 as host compiler.
+  Filed an NVCC bug that will be fixed in a future version of the CUDA Toolkit
+    (NVBug 3136307).
+- NVIDIA/thrust#1272: Fix ambiguous `iter_swap` call when
+    using `thrust::partition` with STL containers.
+  Thanks to Isaac Deutsch for this contribution.
+- NVIDIA/thrust#1281: Update our bundled `FindTBB.cmake` module to support
+    latest MSVC.
+- NVIDIA/thrust#1298: Use semantic versioning rules for our CMake package's
+    compatibility checks.
+  Thanks to Kai Germaschewski for this contribution.
+- NVIDIA/thrust#1300: Use `FindPackageHandleStandardArgs` to print standard
+    status messages when our CMake package is found.
+  Thanks to Kai Germaschewski for this contribution.
+- NVIDIA/thrust#1320: Use feature-testing instead of a language dialect check
+    for `thrust::remove_cvref`.
+  Thanks to Andrew Corrigan for this contribution.
+- NVIDIA/thrust#1319: Suppress GPU deprecation warnings.
+
+### Other Enhancements
+
+- NVIDIA/cub#213: Removed some tuning policies for unsupported hardware (<SM35).
+- References to the old Github repository and branch names were updated.
+  - Github's `thrust/cub` repository is now `NVIDIA/cub`.
+  - Development has moved from the `master` branch to the `main` branch.
+
+## Thrust 1.10.0 (NVIDIA HPC SDK 20.9, CUDA Toolkit 11.2)
+
+Thrust 1.10.0 is the major release accompanying the NVIDIA HPC SDK 20.9 release
+  and the CUDA Toolkit 11.2 release.
+It drops support for C++03, GCC < 5, Clang < 6, and MSVC < 2017.
+It also overhauls CMake support.
+Finally, we now have a Code of Conduct for contributors:
+https://github.com/NVIDIA/thrust/blob/main/CODE_OF_CONDUCT.md
+
+### Breaking Changes
+
+- C++03 is no longer supported.
+- GCC < 5, Clang < 6, and MSVC < 2017 are no longer supported.
+- C++11 is deprecated.
+  Using this dialect will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` or `THRUST_IGNORE_DEPRECATED_CPP_11`.
+  Suppression is only a short term solution.
+  We will be dropping support for C++11 in the near future.
+- Asynchronous algorithms now require C++14.
+- CMake < 3.15 is no longer supported.
+- The default branch on GitHub is now called `main`.
+- Allocator and vector classes have been replaced with alias templates.
+
+### New Features
+
+- NVIDIA/thrust#1159: CMake multi-config support, which allows multiple
+    combinations of host and device systems to be built and tested at once.
+  More details can be found here: https://github.com/NVIDIA/thrust/blob/main/CONTRIBUTING.md#multi-config-cmake-options
+- CMake refactoring:
+  - Added install targets to CMake builds.
+  - Added support for CUB tests and examples.
+  - Thrust can be added to another CMake project by calling `add_subdirectory`
+      with the Thrust source root (see NVIDIA/thrust#976).
+    An example can be found here:
+      https://github.com/NVIDIA/thrust/blob/main/examples/cmake/add_subdir/CMakeLists.txt
+  - CMake < 3.15 is no longer supported.
+  - Dialects are now configured through target properties.
+    A new `THRUST_CPP_DIALECT` option has been added for single config mode.
+    Logic that modified `CMAKE_CXX_STANDARD` and `CMAKE_CUDA_STANDARD` has been
+      eliminated.
+  - Testing related CMake code has been moved to `testing/CMakeLists.txt`
+  - Example related CMake code has been moved to `examples/CMakeLists.txt`
+  - Header testing related CMake code has been moved to `cmake/ThrustHeaderTesting.cmake`
+  - CUDA configuration CMake code has been moved to to `cmake/ThrustCUDAConfig.cmake`.
+  - Now we explicitly `include(cmake/*.cmake)` files rather than searching
+      `CMAKE_MODULE_PATH` - we only want to use the ones in the repo.
+- `thrust::transform_input_output_iterator`, a variant of transform iterator
+    adapter that works as both an input iterator and an output iterator.
+  The given input function is applied after reading from the wrapped iterator
+    while the output function is applied before writing to the wrapped iterator.
+  Thanks to Trevor Smith for this contribution.
+
+### Other Enhancements
+
+- Contributor documentation: https://github.com/NVIDIA/thrust/blob/main/CONTRIBUTING.md
+- Code of Conduct: https://github.com/NVIDIA/thrust/blob/main/CODE_OF_CONDUCT.md.
+  Thanks to Conor Hoekstra for this contribution.
+- Support for all combinations of host and device systems.
+- C++17 support.
+- NVIDIA/thrust#1221: Allocator and vector classes have been replaced with
+    alias templates.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1186: Use placeholder expressions to simplify the definitions
+    of a number of algorithms.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1170: More conforming semantics for scan algorithms:
+  - Follow P0571's guidance regarding intermediate types.
+    - https://wg21.link/P0571
+    - The accumulator's type is now:
+      - The type of the user-supplied initial value (if provided), or
+      - The input iterator's value type if no initial value.
+  - Follow C++ standard guidance for default binary operator type.
+    - https://eel.is/c++draft/exclusive.scan#1
+    - Thrust binary/unary functors now specialize a default void template
+        parameter.
+      Types are deduced and forwarded transparently.
+    - Updated the scan's default binary operator to the new `thrust::plus<>`
+        specialization.
+  - The `thrust::intermediate_type_from_function_and_iterators` helper is no
+      longer needed and has been removed.
+- NVIDIA/thrust#1255: Always use `cudaStreamSynchronize` instead of
+    `cudaDeviceSynchronize` if the execution policy has a stream attached to it.
+  Thanks to Rong Ou for this contribution.
+- NVIDIA/thrust#1201: Tests for correct handling of legacy and per-thread
+    default streams.
+  Thanks to Rong Ou for this contribution.
+
+### Bug Fixes
+
+- NVIDIA/thrust#1260: Fix `thrust::transform_inclusive_scan` with heterogeneous
+    types.
+  Thanks to Rong Ou for this contribution.
+- NVIDIA/thrust#1258, NVC++ FS #28463: Ensure the CUDA radix sort backend
+    synchronizes before returning; otherwise, copies from temporary storage will
+    race with destruction of said temporary storage.
+- NVIDIA/thrust#1264: Evaluate `CUDA_CUB_RET_IF_FAIL` macro argument only once.
+  Thanks to Jason Lowe for this contribution.
+- NVIDIA/thrust#1262: Add missing `<stdexcept>` header.
+- NVIDIA/thrust#1250: Restore some `THRUST_DECLTYPE_RETURNS` macros in async
+    test implementations.
+- NVIDIA/thrust#1249: Use `std::iota` in `CUDATestDriver::target_devices`.
+  Thanks to Michael Francis for this contribution.
+- NVIDIA/thrust#1244: Check for macro collisions with system headers during
+    header testing.
+- NVIDIA/thrust#1224: Remove unnecessary SFINAE contexts from asynchronous
+    algorithms.
+- NVIDIA/thrust#1190: Make `out_of_memory_recovery` test trigger faster.
+- NVIDIA/thrust#1187: Elminate superfluous iterators specific to the CUDA
+    backend.
+- NVIDIA/thrust#1181: Various fixes for GoUDA.
+  Thanks to Andrei Tchouprakov for this contribution.
+- NVIDIA/thrust#1178, NVIDIA/thrust#1229: Use transparent functionals in
+    placeholder expressions, fixing issues with `thrust::device_reference` and
+    placeholder expressions and `thrust::find` with asymmetric equality
+    operators.
+- NVIDIA/thrust#1153: Switch to placement new instead of assignment to
+    construct items in uninitialized memory.
+  Thanks to Hugh Winkler for this contribution.
+- NVIDIA/thrust#1050: Fix compilation of asynchronous algorithms when RDC is
+    enabled.
+- NVIDIA/thrust#1042: Correct return type of
+    `thrust::detail::predicate_to_integral` from `bool` to `IntegralType`.
+  Thanks to Andreas Hehn for this contribution.
+- NVIDIA/thrust#1009: Avoid returning uninitialized allocators.
+  Thanks to Zhihao Yuan for this contribution.
+- NVIDIA/thrust#990: Add missing `<thrust/system/cuda/memory.h>` include to
+    `<thrust/system/cuda/detail/malloc_and_free.h>`.
+  Thanks to Robert Maynard for this contribution.
+- NVIDIA/thrust#966: Fix spurious MSVC conversion with loss of data warning in
+    sort algorithms.
+  Thanks to Zhihao Yuan for this contribution.
+- Add more metadata to mock specializations for testing iterator in
+   `testing/copy.cu`.
+- Add missing include to shuffle unit test.
+- Specialize `thrust::wrapped_function` for `void` return types because MSVC is
+    not a fan of the pattern `return static_cast<void>(expr);`.
+- Replace deprecated `tbb/tbb_thread.h` with `<thread>`.
+- Fix overcounting of initial value in TBB scans.
+- Use `thrust::advance` instead of `+=` for generic iterators.
+- Wrap the OMP flags in `-Xcompiler` for NVCC
+- Extend `ASSERT_STATIC_ASSERT` skip for the OMP backend.
+- Add missing header caught by `tbb.cuda` configs.
+- Fix "unsafe API" warnings in examples on MSVC: `s/fopen/fstream/`
+- Various C++17 fixes.
+
+## Thrust 1.9.10-1 (NVIDIA HPC SDK 20.7, CUDA Toolkit 11.1)
+
+Thrust 1.9.10-1 is the minor release accompanying the NVIDIA HPC SDK 20.7 release
+  and the CUDA Toolkit 11.1 release.
+
+### Bug Fixes
+
+- #1214, NVBug 200619442: Stop using `std::allocator` APIs deprecated in C++17.
+- #1216, NVBug 200540293: Make `thrust::optional` work with Clang when used
+    with older libstdc++.
+- #1207, NVBug 200618218: Don't force C++14 with older compilers that don't
+    support it.
+- #1218: Wrap includes of `<memory>` and `<algorithm>` to avoid circular
+    inclusion with NVC++.
+
+## Thrust 1.9.10 (NVIDIA HPC SDK 20.5)
+
+Thrust 1.9.10 is the release accompanying the NVIDIA HPC SDK 20.5 release.
+It adds CMake support for compilation with NVC++ and a number of minor bug fixes
+  for NVC++.
+It also adds CMake `find_package` support, which replaces the broken 3rd-party
+  legacy `FindThrust.cmake` script.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+
+### Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+### New Features
+
+- #1130: CMake `find_package` support.
+  This is significant because there is a legacy `FindThrust.cmake` script
+    authored by a third party in widespread use in the community which has a
+    bug in how it parses Thrust version numbers which will cause it to
+    incorrectly parse 1.9.10.
+  This script only handles the first digit of each part of the Thrust version
+    number correctly: for example, Thrust 17.17.17 would be interpreted as
+    Thrust 1.1.1701717.
+  You can find directions for using the new CMake `find_package` support and
+    migrating away from the legacy `FindThrust.cmake` [here](https://github.com/NVIDIA/thrust/blob/main/thrust/cmake/README.md)
+- #1129: Added `thrust::detail::single_device_tls_caching_allocator`, a
+    convenient way to get an MR caching allocator for device memory, which is
+    used by NVC++.
+
+### Other Enhancements
+
+- #1129: Refactored RDC handling in CMake to be a global option and not create
+    two targets for each example and test.
+
+### Bug Fixes
+
+- #1129: Fix the legacy `thrust::return_temporary_buffer` API to support
+    passing a size.
+  This was necessary to enable usage of Thrust caching MR allocators with
+    synchronous Thrust algorithms.
+  This change has allowed NVC++'s C++17 Parallel Algorithms implementation to
+    switch to use Thrust caching MR allocators for device temporary storage,
+    which gives a 2x speedup on large multi-GPU systems such as V100 and A100
+    DGX where `cudaMalloc` is very slow.
+- #1128: Respect `CUDA_API_PER_THREAD_DEFAULT_STREAM`.
+  Thanks to Rong Ou for this contribution.
+- #1131: Fix the one-policy overload of `thrust::async::copy` to not copy the
+    policy, resolving use-afer-move issues.
+- #1145: When cleaning up type names in `unittest::base_class_name`, only call
+    `std::string::replace` if we found the substring we are looking to replace.
+- #1139: Don't use `cxx::__demangle` in NVC++.
+- #1102: Don't use `thrust::detail::normal_distribution_nvcc` for Feta because
+    it uses `erfcinv`, a non-standard function that Feta doesn't have.
+
+## Thrust 1.9.9 (CUDA Toolkit 11.0)
+
+Thrust 1.9.9 adds support for NVC++, which uses Thrust to implement
+  GPU-accelerated C++17 Parallel Algorithms.
+`thrust::zip_function` and `thrust::shuffle` were also added.
+C++03, C++11, GCC < 5, Clang < 6, and MSVC < 2017 are now deprecated.
+Starting with the upcoming 1.10.0 release, C++03 support will be dropped
+  entirely.
+All other deprecated platforms will be dropped in the near future.
+
+### Breaking Changes
+
+- #1082: Thrust now checks that it is compatible with the version of CUB found
+    in your include path, generating an error if it is not.
+  If you are using your own version of CUB, it may be too old.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+- #1089: C++03 and C++11 are deprecated.
+  Using these dialects will generate a compile-time warning.
+  These warnings can be suppressed by defining
+    `THRUST_IGNORE_DEPRECATED_CPP_DIALECT` (to suppress C++03 and C++11
+    deprecation warnings) or `THRUST_IGNORE_DEPRECATED_CPP_11` (to suppress C++11
+    deprecation warnings).
+  Suppression is only a short term solution.
+  We will be dropping support for C++03 in the 1.10.0 release and C++11 in the
+    near future.
+- #1089: GCC < 5, Clang < 6, and MSVC < 2017 are deprecated.
+  Using these compilers will generate a compile-time warning.
+  These warnings can be suppressed by defining
+  `THRUST_IGNORE_DEPRECATED_COMPILER`.
+  Suppression is only a short term solution.
+  We will be dropping support for these compilers in the near future.
+
+### New Features
+
+- #1086: Support for NVC++ aka "Feta".
+  The most significant change is in how we use `__CUDA_ARCH__`.
+  Now, there are four macros that must be used:
+  - `THRUST_IS_DEVICE_CODE`, which should be used in an `if` statement around
+      device-only code.
+  - `THRUST_INCLUDE_DEVICE_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+  - `THRUST_IS_HOST_CODE`, which should be used in an `if` statement around
+      host-only code.
+  - `THRUST_INCLUDE_HOST_CODE`, which should be used in an `#if` preprocessor
+      directive inside of the `if` statement mentioned in the prior bullet.
+- #1085: `thrust::shuffle`.
+  Thanks to Rory Mitchell for this contribution.
+- #1029: `thrust::zip_function`, a facility for zipping functions that take N
+    parameters instead of a tuple of N parameters as `thrust::zip_iterator`
+    does.
+  Thanks to Ben Jude for this contribution.
+- #1068: `thrust::system::cuda::managed_memory_pointer`, a universal memory
+    strongly typed pointer compatible with the ISO C++ Standard Library.
+
+### Other Enhancements
+
+- #1029: Thrust is now built and tested with NVCC warnings treated as errors.
+- #1029: MSVC C++11 support.
+- #1029: `THRUST_DEPRECATED` abstraction for generating compile-time
+    deprecation warning messages.
+- #1029: `thrust::pointer<T>::pointer_to(reference)`.
+- #1070: Unit test for `thrust::inclusive_scan` with a user defined types.
+  Thanks to Conor Hoekstra for this contribution.
+
+### Bug Fixes
+
+- #1088: Allow `thrust::replace` to take functions that have non-`const`
+    `operator()`.
+- #1094: Add missing `constexpr` to `par_t` constructors.
+  Thanks to Patrick Stotko for this contribution.
+- #1077: Remove `__device__` from CUDA MR-based device allocators to fix
+    obscure "host function called from host device function" warning that occurs
+    when you use the new Thrust MR-based allocators.
+- #1029: Remove inconsistently-used `THRUST_BEGIN`/`END_NS` macros.
+- #1029: Fix C++ dialect detection on newer MSVC.
+- #1029 Use `_Pragma`/`__pragma` instead of `#pragma` in macros.
+- #1029: Replace raw `__cplusplus` checks with the appropriate Thrust macros.
+- #1105: Add a missing `<math.h>` include.
+- #1103: Fix regression of `thrust::detail::temporary_allocator` with non-CUDA
+    back ends.
+- #1111: Use Thrust's random number engine instead of `std::`s in device code.
+- #1108: Get rid of a GCC 9 warning about deprecated generation of copy ctors.
+
+## Thrust 1.9.8-1 (NVIDIA HPC SDK 20.3)
+
+Thrust 1.9.8-1 is a variant of 1.9.8 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 11.0
+  release.
+
+## Thrust 1.9.8 (CUDA Toolkit 11.0 Early Access)
+
+Thrust 1.9.8, which is included in the CUDA Toolkit 11.0 release, removes
+  Thrust's internal derivative of CUB, upstreams all relevant changes too CUB,
+  and adds CUB as a Git submodule.
+It will now be necessary to do `git clone --recursive` when checking out
+  Thrust, and to update the CUB submodule when pulling in new Thrust changes.
+Additionally, CUB is now included as a first class citizen in the CUDA toolkit.
+Thrust 1.9.8 also fixes bugs preventing most Thrust algorithms from working
+  with more than `2^31-1` elements.
+Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+  Thrust) work with large element counts.
+
+### Breaking Changes
+
+- Thrust will now use the version of CUB in your include path instead of its own
+    internal copy.
+  If you are using your own version of CUB, it may be older and incompatible
+    with Thrust.
+  It is recommended to simply delete your own version of CUB and use the
+    version of CUB that comes with Thrust.
+
+### Other Enhancements
+
+- Refactor Thrust and CUB to support 64-bit indices in most algorithms.
+  In most cases, Thrust now selects between kernels that use 32-bit indices and
+    64-bit indices at runtime depending on the size of the input.
+  This means large element counts work, but small element counts do not have to
+    pay for the register usage of 64-bit indices if they are not needed.
+  Now, `thrust::reduce`, `thrust::*_scan`, and related algorithms (aka most of
+    Thrust) work with more than `2^31-1` elements.
+  Notably, `thrust::sort` is still limited to less than `2^31-1` elements.
+- CUB is now a submodule and the internal copy of CUB has been removed.
+- #1051: Stop specifying the `__launch_bounds__` minimum blocks parameter
+    because it messes up register allocation and increases register pressure,
+    and we don't actually know at compile time how many blocks we will use
+    (aside from single tile kernels).
+
+### Bug Fixes
+
+- #1020: After making a CUDA API call, always clear the global CUDA error state
+    by calling `cudaGetLastError`.
+- #1021: Avoid calling destroy in the destructor of a Thrust vector if the
+    vector is empty.
+- #1046: Actually throw `thrust::bad_alloc` when `thrust::system::cuda::malloc`
+    fails instead of just constructing a temporary and doing nothing with it.
+- Add missing copy constructor or copy assignment operator to all classes that
+    GCC 9's `-Wdeprecated-copy` complains about
+- Add missing move operations to `thrust::system::cuda::vector`.
+- #1015: Check that the backend is CUDA before using CUDA-specifics in
+    `thrust::detail::temporary_allocator`.
+  Thanks to Hugh Winkler for this contribution.
+- #1055: More correctly detect the presence of aligned/sized `new`/`delete`.
+- #1043: Fix ill-formed specialization of `thrust::system::is_error_code_enum`
+    for `thrust::event_errc`.
+  Thanks to Toru Niina for this contribution.
+- #1027: Add tests for `thrust::tuple_for_each` and `thrust::tuple_subset`.
+  Thanks to Ben Jude for this contribution.
+- #1027: Use correct macro in `thrust::tuple_for_each`.
+  Thanks to Ben Jude for this contribution.
+- #1026: Use correct MSVC version formatting in CMake.
+  Thanks to Ben Jude for this contribution.
+- Workaround an NVCC issue with type aliases with template template arguments
+    containing a parameter pack.
+- Remove unused functions from the CUDA backend which call slow CUDA attribute
+    query APIs.
+- Replace `CUB_RUNTIME_FUNCTION` with `THRUST_RUNTIME_FUNCTION`.
+- Correct typo in `thrust::transform` documentation.
+  Thanks to Eden Yefet for this contribution.
+
+### Known Issues
+
+- `thrust::sort` remains limited to `2^31-1` elements for now.
+
+## Thrust 1.9.7-1 (CUDA Toolkit 10.2 for Tegra)
+
+Thrust 1.9.7-1 is a minor release accompanying the CUDA Toolkit 10.2 release
+  for Tegra.
+It is nearly identical to 1.9.7.
+
+### Bug Fixes
+
+- Remove support for GCC's broken nodiscard-like attribute.
+
+## Thrust 1.9.7 (CUDA Toolkit 10.2)
+
+Thrust 1.9.7 is a minor release accompanying the CUDA Toolkit 10.2 release.
+Unfortunately, although the version and patch numbers are identical, one bug
+  fix present in Thrust 1.9.7 (NVBug 2646034: Fix incorrect dependency handling
+  for stream acquisition in `thrust::future`) was not included in the CUDA
+  Toolkit 10.2 preview release for AArch64 SBSA.
+The tag `cuda-10.2aarch64sbsa` contains the exact version of Thrust present
+  in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+
+### Bug Fixes
+
+- #967, NVBug 2448170: Fix the CUDA backend `thrust::for_each` so that it
+    supports large input sizes with 64-bit indices.
+- NVBug 2646034: Fix incorrect dependency handling for stream acquisition in
+    `thrust::future`.
+  - Not present in the CUDA Toolkit 10.2 preview release for AArch64 SBSA.
+- #968, NVBug 2612102: Fix the `thrust::mr::polymorphic_adaptor` to actually
+    use its template parameter.
+
+## Thrust 1.9.6-1 (NVIDIA HPC SDK 20.3)
+
+Thrust 1.9.6-1 is a variant of 1.9.6 accompanying the NVIDIA HPC SDK 20.3
+  release.
+It contains modifications necessary to serve as the implementation of NVC++'s
+  GPU-accelerated C++17 Parallel Algorithms when using the CUDA Toolkit 10.1
+  Update 2 release.
+
+## Thrust 1.9.6 (CUDA Toolkit 10.1 Update 2)
+
+Thrust 1.9.6 is a minor release accompanying the CUDA Toolkit 10.1 Update 2
+  release.
+
+### Bug Fixes
+
+- NVBug 2509847: Inconsistent alignment of `thrust::complex`
+- NVBug 2586774: Compilation failure with Clang + older libstdc++ that doesn't
+    have `std::is_trivially_copyable`
+- NVBug 200488234: CUDA header files contain Unicode characters which leads
+    compiling errors on Windows
+- #949, #973, NVBug 2422333, NVBug 2522259, NVBug 2528822:
+    `thrust::detail::aligned_reinterpret_cast` must be annotated with
+    `__host__ __device__`.
+- NVBug 2599629: Missing include in the OpenMP sort implementation
+- NVBug 200513211: Truncation warning in test code under VC142
+
+## Thrust 1.9.5 (CUDA Toolkit 10.1 Update 1)
+
+Thrust 1.9.5 is a minor release accompanying the CUDA Toolkit 10.1 Update 1
+  release.
+
+### Bug Fixes
+
+- NVBug 2502854: Fixed assignment of
+    `thrust::device_vector<thrust::complex<T>>` between host and device.
+
+## Thrust 1.9.4 (CUDA Toolkit 10.1)
+
+Thrust 1.9.4 adds asynchronous interfaces for parallel algorithms, a new
+  allocator system including caching allocators and unified memory support, as
+  well as a variety of other enhancements, mostly related to
+  C++11/C++14/C++17/C++20 support.
+The new asynchronous algorithms in the `thrust::async` namespace return
+  `thrust::event` or `thrust::future` objects, which can be waited upon to
+  synchronize with the completion of the parallel operation.
+
+### Breaking Changes
+
+Synchronous Thrust algorithms now block until all of their operations have
+  completed.
+Use the new asynchronous Thrust algorithms for non-blocking behavior.
+
+### New Features
+
+- `thrust::event` and `thrust::future<T>`, uniquely-owned asynchronous handles
+    consisting of a state (ready or not ready), content (some value; for
+    `thrust::future` only), and an optional set of objects that should be
+    destroyed only when the future's value is ready and has been consumed.
+  - The design is loosely based on C++11's `std::future`.
+  - They can be `.wait`'d on, and the value of a future can be waited on and
+      retrieved with `.get` or `.extract`.
+  - Multiple `thrust::event`s and `thrust::future`s can be combined with
+      `thrust::when_all`.
+  - `thrust::future`s can be converted to `thrust::event`s.
+  - Currently, these primitives are only implemented for the CUDA backend and
+      are C++11 only.
+- New asynchronous algorithms that return `thrust::event`/`thrust::future`s,
+    implemented as C++20 range style customization points:
+    - `thrust::async::reduce`.
+    - `thrust::async::reduce_into`, which takes a target location to store the
+        reduction result into.
+    - `thrust::async::copy`, including a two-policy overload that allows
+        explicit cross system copies which execution policy properties can be
+        attached to.
+    - `thrust::async::transform`.
+    - `thrust::async::for_each`.
+    - `thrust::async::stable_sort`.
+    - `thrust::async::sort`.
+    - By default the asynchronous algorithms use the new caching allocators.
+        Deallocation of temporary storage is deferred until the destruction of
+        the returned `thrust::future`. The content of `thrust::future`s is
+        stored in either device or universal memory and transferred to the host
+        only upon request to prevent unnecessary data migration.
+    - Asynchronous algorithms are currently only implemented for the CUDA
+        system and are C++11 only.
+- `exec.after(f, g, ...)`, a new execution policy method that takes a set of
+    `thrust::event`/`thrust::future`s and returns an execution policy that
+    operations on that execution policy should depend upon.
+- New logic and mindset for the type requirements for cross-system sequence
+    copies (currently only used by `thrust::async::copy`), based on:
+  - `thrust::is_contiguous_iterator` and `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR`
+      for detecting/indicating that an iterator points to contiguous storage.
+  - `thrust::is_trivially_relocatable` and
+      `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE` for detecting/indicating that a
+      type is `memcpy`able (based on principles from
+      [P1144](https://wg21.link/P1144)).
+  - The new approach reduces buffering, increases performance, and increases
+      correctness.
+  - The fast path is now enabled when copying CUDA `__half` and vector types with
+      `thrust::async::copy`.
+- All Thrust synchronous algorithms for the CUDA backend now actually
+    synchronize. Previously, any algorithm that did not allocate temporary
+    storage (counterexample: `thrust::sort`) and did not have a
+    computation-dependent result (counterexample: `thrust::reduce`) would
+    actually be launched asynchronously. Additionally, synchronous algorithms
+    that allocated temporary storage would become asynchronous if a custom
+    allocator was supplied that did not synchronize on allocation/deallocation,
+    unlike `cudaMalloc`/`cudaFree`. So, now `thrust::for_each`,
+    `thrust::transform`, `thrust::sort`, etc are truly synchronous. In some
+    cases this may be a performance regression; if you need asynchrony, use the
+    new asynchronous algorithms.
+- Thrust's allocator framework has been rewritten. It now uses a memory
+    resource system, similar to C++17's `std::pmr` but supporting static
+    polymorphism. Memory resources are objects that allocate untyped storage and
+    allocators are cheap handles to memory resources in this new model. The new
+    facilities live in `<thrust/mr/*>`.
+  - `thrust::mr::memory_resource<Pointer>`, the memory resource base class,
+      which takes a (possibly tagged) pointer to `void` type as a parameter.
+  - `thrust::mr::allocator<T, MemoryResource>`, an allocator backed by a memory
+      resource object.
+  - `thrust::mr::polymorphic_adaptor_resource<Pointer>`, a type-erased memory
+      resource adaptor.
+  - `thrust::mr::polymorphic_allocator<T>`, a C++17-style polymorphic allocator
+      backed by a type-erased memory resource object.
+  - New tunable C++17-style caching memory resources,
+      `thrust::mr::(disjoint_)?(un)?synchronized_pool_resource`, designed to
+      cache both small object allocations and large repetitive temporary
+      allocations. The disjoint variants use separate storage for management of
+      the pool, which is necessary if the memory being allocated cannot be
+      accessed on the host (e.g.  device memory).
+  - System-specific allocators were rewritten to use the new memory resource
+      framework.
+  - New `thrust::device_memory_resource` for allocating device memory.
+  - New `thrust::universal_memory_resource` for allocating memory that can be
+      accessed from both the host and device (e.g. `cudaMallocManaged`).
+  - New `thrust::universal_host_pinned_memory_resource` for allocating memory
+      that can be accessed from the host and the device but always resides in
+      host memory (e.g. `cudaMallocHost`).
+  - `thrust::get_per_device_resource` and `thrust::per_device_allocator`, which
+      lazily create and retrieve a per-device singleton memory resource.
+  - Rebinding mechanisms (`rebind_traits` and `rebind_alloc`) for
+      `thrust::allocator_traits`.
+  - `thrust::device_make_unique`, a factory function for creating a
+      `std::unique_ptr` to a newly allocated object in device memory.
+  - `<thrust/detail/memory_algorithms>`, a C++11 implementation of the C++17
+      uninitialized memory algorithms.
+  - `thrust::allocate_unique` and friends, based on the proposed C++23
+      [`std::allocate_unique`](https://wg21.link/P0211).
+- New type traits and metaprogramming facilities. Type traits are slowly being
+    migrated out of `thrust::detail::` and `<thrust/detail/*>`; their new home
+    will be `thrust::` and `<thrust/type_traits/*>`.
+  - `thrust::is_execution_policy`.
+  - `thrust::is_operator_less_or_greater_function_object`, which detects
+      `thrust::less`, `thrust::greater`, `std::less`, and `std::greater`.
+  - `thrust::is_operator_plus_function_object``, which detects `thrust::plus`
+      and `std::plus`.
+  - `thrust::remove_cvref(_t)?`, a C++11 implementation of C++20's
+      `thrust::remove_cvref(_t)?`.
+  - `thrust::void_t`, and various other new type traits.
+  - `thrust::integer_sequence` and friends, a C++11 implementation of C++20's
+      `std::integer_sequence`
+  - `thrust::conjunction`, `thrust::disjunction`, and `thrust::disjunction`, a
+      C++11 implementation of C++17's logical metafunctions.
+  - Some Thrust type traits (such as `thrust::is_constructible`) have been
+      redefined in terms of C++11's type traits when they are available.
+- `<thrust/detail/tuple_algorithms.h>`, new `std::tuple` algorithms:
+  - `thrust::tuple_transform`.
+  - `thrust::tuple_for_each`.
+  - `thrust::tuple_subset`.
+- Miscellaneous new `std::`-like facilities:
+  - `thrust::optional`, a C++11 implementation of C++17's `std::optional`.
+  - `thrust::addressof`, an implementation of C++11's `std::addressof`.
+  - `thrust::next` and `thrust::prev`, an implementation of C++11's `std::next`
+      and `std::prev`.
+  - `thrust::square`, a `<functional>` style unary function object that
+      multiplies its argument by itself.
+  - `<thrust/limits.h>` and `thrust::numeric_limits`, a customized version of
+      `<limits>` and `std::numeric_limits`.
+- `<thrust/detail/preprocessor.h>`, new general purpose preprocessor facilities:
+  - `THRUST_PP_CAT[2-5]`, concatenates two to five tokens.
+  - `THRUST_PP_EXPAND(_ARGS)?`, performs double expansion.
+  - `THRUST_PP_ARITY` and `THRUST_PP_DISPATCH`, tools for macro overloading.
+  - `THRUST_PP_BOOL`, boolean conversion.
+  - `THRUST_PP_INC` and `THRUST_PP_DEC`, increment/decrement.
+  - `THRUST_PP_HEAD`, a variadic macro that expands to the first argument.
+  - `THRUST_PP_TAIL`, a variadic macro that expands to all its arguments after
+      the first.
+  - `THRUST_PP_IIF`, bitwise conditional.
+  - `THRUST_PP_COMMA_IF`, and `THRUST_PP_HAS_COMMA`, facilities for adding and
+      detecting comma tokens.
+  - `THRUST_PP_IS_VARIADIC_NULLARY`, returns true if called with a nullary
+      `__VA_ARGS__`.
+  - `THRUST_CURRENT_FUNCTION`, expands to the name of the current function.
+- New C++11 compatibility macros:
+  - `THRUST_NODISCARD`, expands to `[[nodiscard]]` when available and the best
+      equivalent otherwise.
+  - `THRUST_CONSTEXPR`, expands to `constexpr` when available and the best
+      equivalent otherwise.
+  - `THRUST_OVERRIDE`, expands to `override` when available and the best
+      equivalent otherwise.
+  - `THRUST_DEFAULT`, expands to `= default;` when available and the best
+      equivalent otherwise.
+  - `THRUST_NOEXCEPT`, expands to `noexcept` when available and the best
+      equivalent otherwise.
+  - `THRUST_FINAL`, expands to `final` when available and the best equivalent
+      otherwise.
+  - `THRUST_INLINE_CONSTANT`, expands to `inline constexpr` when available and
+      the best equivalent otherwise.
+- `<thrust/detail/type_deduction.h>`, new C++11-only type deduction helpers:
+  - `THRUST_DECLTYPE_RETURNS*`, expand to function definitions with suitable
+      conditional `noexcept` qualifiers and trailing return types.
+  - `THRUST_FWD(x)`, expands to `::std::forward<decltype(x)>(x)`.
+  - `THRUST_MVCAP`, expands to a lambda move capture.
+  - `THRUST_RETOF`, expands to a decltype computing the return type of an
+      invocable.
+- New CMake build system.
+
+### New Examples
+
+- `mr_basic` demonstrates how to use the new memory resource allocator system.
+
+### Other Enhancements
+
+- Tagged pointer enhancements:
+  - New `thrust::pointer_traits` specialization for `void const*`.
+  - `nullptr` support to Thrust tagged pointers.
+  - New `explicit operator bool` for Thrust tagged pointers when using C++11
+      for `std::unique_ptr` interoperability.
+  - Added `thrust::reinterpret_pointer_cast` and `thrust::static_pointer_cast`
+      for casting Thrust tagged pointers.
+- Iterator enhancements:
+  - `thrust::iterator_system` is now SFINAE friendly.
+  - Removed cv qualifiers from iterator types when using
+      `thrust::iterator_system`.
+- Static assert enhancements:
+  - New `THRUST_STATIC_ASSERT_MSG`, takes an optional string constant to be
+      used as the error message when possible.
+  - Update `THRUST_STATIC_ASSERT(_MSG)` to use C++11's `static_assert` when
+      it's available.
+  - Introduce a way to test for static assertions.
+- Testing enhancements:
+  - Additional scalar and sequence types, including non-builtin types and
+      vectors with unified memory allocators, have been added to the list of
+      types used by generic unit tests.
+  - The generation of random input data has been improved to increase the range
+      of values used and catch more corner cases.
+  - New `unittest::truncate_to_max_representable` utility for avoiding the
+      generation of ranges that cannot be represented by the underlying element
+      type in generic unit test code.
+  - The test driver now synchronizes with CUDA devices and check for errors
+      after each test, when switching devices, and after each raw kernel launch.
+  - The `warningtester` uber header is now compiled with NVCC to avoid needing
+      to disable CUDA-specific code with the preprocessor.
+  - Fixed the unit test framework's `ASSERT_*` to print `char`s as `int`s.
+  - New `DECLARE_INTEGRAL_VARIABLE_UNITTEST` test declaration macro.
+  - New `DECLARE_VARIABLE_UNITTEST_WITH_TYPES_AND_NAME` test declaration macro.
+  - `thrust::system_error` in the CUDA backend now print out its `cudaError_t`
+      enumerator in addition to the diagnostic message.
+  - Stopped using conditionally signed types like `char`.
+
+### Bug Fixes
+
+- #897, NVBug 2062242: Fix compilation error when using `__device__` lambdas
+    with `thrust::reduce` on MSVC.
+- #908, NVBug 2089386: Static assert that `thrust::generate`/`thrust::fill`
+    isn't operating on const iterators.
+- #919 Fix compilation failure with `thrust::zip_iterator` and
+    `thrust::complex`.
+- #924, NVBug 2096679, NVBug 2315990: Fix dispatch for the CUDA backend's
+    `thrust::reduce` to use two functions (one with the pragma for disabling
+    exec checks, one with `THRUST_RUNTIME_FUNCTION`) instead of one. This fixes
+    a regression with device compilation that started in CUDA Toolkit 9.2.
+- #928, NVBug 2341455: Add missing `__host__ __device__` annotations to a
+    `thrust::complex::operator=` to satisfy GoUDA.
+- NVBug 2094642: Make `thrust::vector_base::clear` not depend on the element
+    type being default constructible.
+- NVBug 2289115: Remove flaky `simple_cuda_streams` example.
+- NVBug 2328572: Add missing `thrust::device_vector` constructor that takes an
+    allocator parameter.
+- NVBug 2455740: Update the `range_view` example to not use device-side launch.
+- NVBug 2455943: Ensure that sized unit tests that use
+    `thrust::counting_iterator` perform proper truncation.
+- NVBug 2455952: Refactor questionable `thrust::copy_if` unit tests.
+
+## Thrust 1.9.3 (CUDA Toolkit 10.0)
+
+Thrust 1.9.3 unifies and integrates CUDA Thrust and GitHub Thrust.
+
+### Bug Fixes
+
+- #725, #850, #855, #859, #860: Unify the `thrust::iter_swap` interface and fix
+    `thrust::device_reference` swapping.
+- NVBug 2004663: Add a `data` method to `thrust::detail::temporary_array` and
+    refactor temporary memory allocation in the CUDA backend to be exception
+    and leak safe.
+- #886, #894, #914: Various documentation typo fixes.
+- #724: Provide `NVVMIR_LIBRARY_DIR` environment variable to NVCC.
+- #878: Optimize `thrust::min/max_element` to only use
+    `thrust::detail::get_iterator_value` for non-numeric types.
+- #899: Make `thrust::cuda::experimental::pinned_allocator`'s comparison
+    operators `const`.
+- NVBug 2092152: Remove all includes of `<cuda.h>`.
+- #911: Fix default comparator element type for `thrust::merge_by_key`.
+
+### Acknowledgments
+
+- Thanks to Andrew Corrigan for contributing fixes for swapping interfaces.
+- Thanks to Francisco Facioni for contributing optimizations for
+    `thrust::min/max_element`.
+
+## Thrust 1.9.2 (CUDA Toolkit 9.2)
+
+Thrust 1.9.2 brings a variety of performance enhancements, bug fixes and test
+  improvements.
+CUB 1.7.5 was integrated, enhancing the performance of `thrust::sort` on
+  small data types and `thrust::reduce`.
+Changes were applied to `complex` to optimize memory access.
+Thrust now compiles with compiler warnings enabled and treated as errors.
+Additionally, the unit test suite and framework was enhanced to increase
+  coverage.
+
+### Breaking Changes
+
+- The `fallback_allocator` example was removed, as it was buggy and difficult
+    to support.
+
+### New Features
+
+- `<thrust/detail/alignment.h>`, utilities for memory alignment:
+  - `thrust::aligned_reinterpret_cast`.
+  - `thrust::aligned_storage_size`, which computes the amount of storage needed
+      for an object of a particular size and alignment.
+  - `thrust::alignment_of`, a C++03 implementation of C++11's
+      `std::alignment_of`.
+  - `thrust::aligned_storage`, a C++03 implementation of C++11's
+      `std::aligned_storage`.
+  - `thrust::max_align_t`, a C++03 implementation of C++11's
+      `std::max_align_t`.
+
+### Bug Fixes
+
+- NVBug 200385527, NVBug 200385119, NVBug 200385113, NVBug 200349350, NVBug
+    2058778: Various compiler warning issues.
+- NVBug 200355591: `thrust::reduce` performance issues.
+- NVBug 2053727: Fixed an ADL bug that caused user-supplied `allocate` to be
+    overlooked but `deallocate` to be called with GCC <= 4.3.
+- NVBug 1777043: Fixed `thrust::complex` to work with `thrust::sequence`.
+
+## Thrust 1.9.1-2 (CUDA Toolkit 9.1)
+
+Thrust 1.9.1-2 integrates version 1.7.4 of CUB and introduces a new CUDA backend
+  for `thrust::reduce` based on CUB.
+
+### Bug Fixes
+
+- NVBug 1965743: Remove unnecessary static qualifiers.
+- NVBug 1940974: Fix regression causing a compilation error when using
+    `thrust::merge_by_key` with `thrust::constant_iterator`s.
+- NVBug 1904217: Allow callables that take non-const refs to be used with
+    `thrust::reduce` and `thrust::*_scan`.
+
+## Thrust 1.9.0-5 (CUDA Toolkit 9.0)
+
+Thrust 1.9.0-5 replaces the original CUDA backend (bulk) with a new one
+  written using CUB, a high performance CUDA collectives library.
+This brings a substantial performance improvement to the CUDA backend across
+  the board.
+
+### Breaking Changes
+
+- Any code depending on CUDA backend implementation details will likely be
+    broken.
+
+### New Features
+
+- New CUDA backend based on CUB which delivers substantially higher performance.
+- `thrust::transform_output_iterator`, a fancy iterator that applies a function
+    to the output before storing the result.
+
+### New Examples
+
+- `transform_output_iterator` demonstrates use of the new fancy iterator
+    `thrust::transform_output_iterator`.
+
+### Other Enhancements
+
+- When C++11 is enabled, functors do not have to inherit from
+    `thrust::(unary|binary)_function` anymore to be used with
+    `thrust::transform_iterator`.
+- Added C++11 only move constructors and move assignment operators for
+    `thrust::detail::vector_base`-based classes, e.g. `thrust::host_vector`,
+    `thrust::device_vector`, and friends.
+
+### Bug Fixes
+
+- `sin(thrust::complex<double>)` no longer has precision loss to float.
+
+### Acknowledgments
+
+- Thanks to Manuel Schiller for contributing a C++11 based enhancement
+    regarding the deduction of functor return types, improving the performance
+    of `thrust::unique` and implementing `thrust::transform_output_iterator`.
+- Thanks to Thibault Notargiacomo for the implementation of move semantics for
+    the `thrust::vector_base`-based classes.
+- Thanks to Duane Merrill for developing CUB and helping to integrate it into
+    Thrust's backend.
+
+## Thrust 1.8.3 (CUDA Toolkit 8.0)
+
+Thrust 1.8.3 is a small bug fix release.
+
+### New Examples
+
+- `range_view` demonstrates the use of a view (a non-owning wrapper for an
+    iterator range with a container-like interface).
+
+### Bug Fixes
+
+- `thrust::(min|max|minmax)_element` can now accept raw device pointers when
+    an explicit device execution policy is used.
+- `thrust::clear` operations on vector types no longer requires the element
+    type to have a default constructor.
+
+## Thrust 1.8.2 (CUDA Toolkit 7.5)
+
+Thrust 1.8.2 is a small bug fix release.
+
+### Bug Fixes
+
+- Avoid warnings and errors concerning user functions called from
+    `__host__ __device__` functions.
+- #632: Fix an error in `thrust::set_intersection_by_key` with the CUDA backend.
+- #651: `thrust::copy` between host and device now accepts execution policies
+    with streams attached, i.e. `thrust::::cuda::par.on(stream)`.
+- #664: `thrust::for_each` and algorithms based on it no longer ignore streams
+    attached to execution policys.
+
+### Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+## Thrust 1.8.1 (CUDA Toolkit 7.0)
+
+Thrust 1.8.1 is a small bug fix release.
+
+### Bug Fixes
+
+- #615, #620: Fixed `thrust::for_each` and `thrust::reduce` to no longer fail on
+    large inputs.
+
+### Known Issues
+
+- #628: `thrust::reduce_by_key` for the CUDA backend fails for Compute
+    Capability 5.0 devices.
+
+## Thrust 1.8.0
+
+Thrust 1.8.0 introduces support for algorithm invocation from CUDA device
+  code, support for CUDA streams, and algorithm performance improvements.
+Users may now invoke Thrust algorithms from CUDA device code, providing a
+  parallel algorithms library to CUDA programmers authoring custom kernels, as
+  well as allowing Thrust programmers to nest their algorithm calls within
+  functors.
+The `thrust::seq` execution policy allows users to require sequential algorithm
+  execution in the calling thread and makes a sequential algorithms library
+  available to individual CUDA threads.
+The `.on(stream)` syntax allows users to request a CUDA stream for kernels
+  launched during algorithm execution.
+Finally, new CUDA algorithm implementations provide substantial performance
+  improvements.
+
+### New Features
+
+- Algorithms in CUDA Device Code:
+    - Thrust algorithms may now be invoked from CUDA `__device__` and
+        `__host__` __device__ functions.
+      Algorithms invoked in this manner must be invoked with an execution
+        policy as the first parameter.
+      The following execution policies are supported in CUDA __device__ code:
+      - `thrust::seq`
+      - `thrust::cuda::par`
+      - `thrust::device`, when THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA.
+  - Device-side algorithm execution may not be parallelized unless CUDA Dynamic
+      Parallelism is available.
+- Execution Policies:
+  - CUDA Streams
+    - The `thrust::cuda::par.on(stream)` syntax allows users to request that
+        CUDA kernels launched during algorithm execution should occur on a given
+        stream.
+    - Algorithms executed with a CUDA stream in this manner may still
+        synchronize with other streams when allocating temporary storage or
+        returning results to the CPU.
+  - `thrust::seq`, which allows users to require that an algorithm execute
+      sequentially in the calling thread.
+- `thrust::complex`, a complex number data type.
+
+### New Examples
+
+- simple_cuda_streams demonstrates how to request a CUDA stream during
+    algorithm execution.
+- async_reduce demonstrates ways to achieve algorithm invocations which are
+    asynchronous with the calling thread.
+
+### Other Enhancements
+
+- CUDA sort performance for user-defined types is 300% faster on Tesla K20c for
+    large problem sizes.
+- CUDA merge performance is 200% faster on Tesla K20c for large problem sizes.
+- CUDA sort performance for primitive types is 50% faster on Tesla K20c for
+    large problem sizes.
+- CUDA reduce_by_key performance is 25% faster on Tesla K20c for large problem
+    sizes.
+- CUDA scan performance is 15% faster on Tesla K20c for large problem sizes.
+- fallback_allocator example is simpler.
+
+### Bug Fixes
+
+- #364: Iterators with unrelated system tags may be used with algorithms invoked
+    with an execution policy
+- #371: Do not redefine `__CUDA_ARCH__`.
+- #379: Fix crash when dereferencing transform_iterator on the host.
+- #391: Avoid use of uppercase variable names.
+- #392: Fix `thrust::copy` between `cusp::complex` and `std::complex`.
+- #396: Program compiled with gcc < 4.3 hangs during comparison sort.
+- #406: `fallback_allocator.cu` example checks device for unified addressing support.
+- #417: Avoid using `std::less<T>` in binary search algorithms.
+- #418: Avoid various warnings.
+- #443: Including version.h no longer configures default systems.
+- #578: NVCC produces warnings when sequential algorithms are used with CPU systems.
+
+### Known Issues
+
+- When invoked with primitive data types, thrust::sort, thrust::sort_by_key,
+    thrust::stable_sort, & thrust::stable_sort_by_key may
+- Sometimes linking fails when compiling with `-rdc=true` with NVCC.
+- The CUDA implementation of thrust::reduce_by_key incorrectly outputs the last
+    element in a segment of equivalent keys instead of the first.
+
+### Acknowledgments
+
+- Thanks to Sean Baxter for contributing faster CUDA reduce, merge, and scan
+    implementations.
+- Thanks to Duane Merrill for contributing a faster CUDA radix sort implementation.
+- Thanks to Filipe Maia for contributing the implementation of thrust::complex.
+
+## Thrust 1.7.2 (CUDA Toolkit 6.5)
+
+Thrust 1.7.2 is a minor bug fix release.
+
+### Bug Fixes
+
+- Avoid use of `std::min` in generic find implementation.
+
+## Thrust 1.7.1 (CUDA Toolkit 6.0)
+
+Thrust 1.7.1 is a minor bug fix release.
+
+### Bug Fixes
+
+- Eliminate identifiers in `set_operations.cu` example with leading underscore.
+- Eliminate unused variable warning in CUDA `reduce_by_key` implementation.
+- Avoid deriving function objects from `std::unary_function` and
+    `std::binary_function`.
+
+## Thrust 1.7.0 (CUDA Toolkit 5.5)
+
+Thrust 1.7.0 introduces a new interface for controlling algorithm execution as
+  well as several new algorithms and performance improvements.
+With this new interface, users may directly control how algorithms execute as
+  well as details such as the allocation of temporary storage.
+Key/value versions of thrust::merge and the set operation algorithms have been
+  added, as well stencil versions of partitioning algorithms.
+thrust::tabulate has been introduced to tabulate the values of functions taking
+  integers.
+For 32b types, new CUDA merge and set operations provide 2-15x faster
+  performance while a new CUDA comparison sort provides 1.3-4x faster
+  performance.
+Finally, a new TBB reduce_by_key implementation provides 80% faster
+  performance.
+
+### Breaking Changes
+
+- Dispatch:
+  - Custom user backend systems' tag types must now inherit from the
+      corresponding system's execution_policy template (e.g.
+      thrust::cuda::execution_policy) instead of the tag struct (e.g.
+      thrust::cuda::tag). Otherwise, algorithm specializations will silently go
+      unfound during dispatch. See examples/minimal_custom_backend.cu and
+      examples/cuda/fallback_allocator.cu for usage examples.
+  - thrust::advance and thrust::distance are no longer dispatched based on
+      iterator system type and thus may no longer be customized.
+- Iterators:
+  - iterator_facade and iterator_adaptor's Pointer template parameters have
+      been eliminated.
+  - iterator_adaptor has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_adaptor).
+  - iterator_facade has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_facade).
+  - iterator_core_access has been moved into the thrust namespace (previously
+      thrust::experimental::iterator_core_access).
+  - All iterators' nested pointer typedef (the type of the result of
+      operator->) is now void instead of a pointer type to indicate that such
+      expressions are currently impossible.
+  - Floating point counting_iterators' nested difference_type typedef is now a
+      signed integral type instead of a floating point type.
+- Other:
+  - normal_distribution has been moved into the thrust::random namespace
+      (previously thrust::random::experimental::normal_distribution).
+  - Placeholder expressions may no longer include the comma operator.
+
+### New Features
+- Execution Policies:
+  - Users may directly control the dispatch of algorithm invocations with
+      optional execution policy arguments.
+    For example, instead of wrapping raw pointers allocated by cudaMalloc with
+      thrust::device_ptr, the thrust::device execution_policy may be passed as
+      an argument to an algorithm invocation to enable CUDA execution.
+  - The following execution policies are supported in this version:
+    - `thrust::host`
+    - `thrust::device`
+    - `thrust::cpp::par`
+    - `thrust::cuda::par`
+    - `thrust::omp::par`
+    - `thrust::tbb::par`
+- Algorithms:
+  - `thrust::merge_by_key`
+  - `thrust::partition` with stencil
+  - `thrust::partition_copy` with stencil
+  - `thrust::set_difference_by_key`
+  - `thrust::set_intersection_by_key`
+  - `thrust::set_symmetric_difference_by_key`
+  - `thrust::set_union_by_key`
+  - `thrust::stable_partition with stencil`
+  - `thrust::stable_partition_copy with stencil`
+  - `thrust::tabulate`
+- Memory Allocation:
+	- `thrust::malloc`
+	- `thrust::free`
+  - `thrust::get_temporary_buffer`
+  - `thrust::return_temporary_buffer`
+
+### New Examples
+
+- uninitialized_vector demonstrates how to use a custom allocator to avoid the
+    automatic initialization of elements in thrust::device_vector.
+
+### Other Enhancements
+
+- Authors of custom backend systems may manipulate arbitrary state during
+    algorithm dispatch by incorporating it into their execution_policy parameter.
+- Users may control the allocation of temporary storage during algorithm
+    execution by passing standard allocators as parameters via execution policies
+    such as thrust::device.
+- THRUST_DEVICE_SYSTEM_CPP has been added as a compile-time target for the
+    device backend.
+- CUDA merge performance is 2-15x faster.
+- CUDA comparison sort performance is 1.3-4x faster.
+- CUDA set operation performance is 1.5-15x faster.
+- TBB reduce_by_key performance is 80% faster.
+- Several algorithms have been parallelized with TBB.
+- Support for user allocators in vectors has been improved.
+- The sparse_vector example is now implemented with merge_by_key instead of
+    sort_by_key.
+- Warnings have been eliminated in various contexts.
+- Warnings about __host__ or __device__-only functions called from __host__
+    __device__ functions have been eliminated in various contexts.
+- Documentation about algorithm requirements have been improved.
+- Simplified the minimal_custom_backend example.
+- Simplified the cuda/custom_temporary_allocation example.
+- Simplified the cuda/fallback_allocator example.
+
+### Bug Fixes
+
+- #248: Fix broken `thrust::counting_iterator<float>` behavior with OpenMP.
+- #231, #209: Fix set operation failures with CUDA.
+- #187: Fix incorrect occupancy calculation with CUDA.
+- #153: Fix broken multi GPU behavior with CUDA.
+- #142: Eliminate warning produced by `thrust::random::taus88` and MSVC 2010.
+- #208: Correctly initialize elements in temporary storage when necessary.
+- #16: Fix compilation error when sorting bool with CUDA.
+- #10: Fix ambiguous overloads of `thrust::reinterpret_tag`.
+
+### Known Issues
+
+- GCC 4.3 and lower may fail to dispatch thrust::get_temporary_buffer correctly
+    causing infinite recursion in examples such as
+    cuda/custom_temporary_allocation.
+
+### Acknowledgments
+
+- Thanks to Sean Baxter, Bryan Catanzaro, and Manjunath Kudlur for contributing
+    a faster merge implementation for CUDA.
+- Thanks to Sean Baxter for contributing a faster set operation implementation
+    for CUDA.
+- Thanks to Cliff Woolley for contributing a correct occupancy calculation
+    algorithm.
+
+## Thrust 1.6.0
+
+Thrust 1.6.0 provides an interface for customization and extension and a new
+  backend system based on the Threading Building Blocks library.
+With this new interface, programmers may customize the behavior of specific
+  algorithms as well as control the allocation of temporary storage or invent
+  entirely new backends.
+These enhancements also allow multiple different backend systems
+  such as CUDA and OpenMP to coexist within a single program.
+Support for TBB allows Thrust programs to integrate more naturally into
+  applications which may already employ the TBB task scheduler.
+
+### Breaking Changes
+
+- The header <thrust/experimental/cuda/pinned_allocator.h> has been moved to
+    <thrust/system/cuda/experimental/pinned_allocator.h>
+- thrust::experimental::cuda::pinned_allocator has been moved to
+    thrust::cuda::experimental::pinned_allocator
+- The macro THRUST_DEVICE_BACKEND has been renamed THRUST_DEVICE_SYSTEM
+- The macro THRUST_DEVICE_BACKEND_CUDA has been renamed THRUST_DEVICE_SYSTEM_CUDA
+- The macro THRUST_DEVICE_BACKEND_OMP has been renamed THRUST_DEVICE_SYSTEM_OMP
+- thrust::host_space_tag has been renamed thrust::host_system_tag
+- thrust::device_space_tag has been renamed thrust::device_system_tag
+- thrust::any_space_tag has been renamed thrust::any_system_tag
+- thrust::iterator_space has been renamed thrust::iterator_system
+
+### New Features
+
+- Backend Systems
+  - Threading Building Blocks (TBB) is now supported
+- Algorithms
+  - `thrust::for_each_n`
+  - `thrust::raw_reference_cast`
+- Types
+  - `thrust::pointer`
+  - `thrust::reference`
+
+### New Examples
+
+- `cuda/custom_temporary_allocation`
+- `cuda/fallback_allocator`
+- `device_ptr`
+- `expand`
+- `minimal_custom_backend`
+- `raw_reference_cast`
+- `set_operations`
+
+### Other Enhancements
+
+- `thrust::for_each` now returns the end of the input range similar to most
+    other algorithms.
+- `thrust::pair` and `thrust::tuple` have swap functionality.
+- All CUDA algorithms now support large data types.
+- Iterators may be dereferenced in user `__device__` or `__global__` functions.
+- The safe use of different backend systems is now possible within a single
+  binary
+
+### Bug Fixes
+
+- #469 `min_element` and `max_element` algorithms no longer require a const comparison operator
+
+### Known Issues
+
+- NVCC may crash when parsing TBB headers on Windows.
+
+## Thrust 1.5.3 (CUDA Toolkit 5.0)
+
+Thrust 1.5.3 is a minor bug fix release.
+
+### Bug Fixes
+
+- Avoid warnings about potential race due to `__shared__` non-POD variable
+
+## Thrust 1.5.2 (CUDA Toolkit 4.2)
+
+Thrust 1.5.2 is a minor bug fix release.
+
+### Bug Fixes
+
+- Fixed warning about C-style initialization of structures
+
+## Thrust 1.5.1 (CUDA Toolkit 4.1)
+
+Thrust 1.5.1 is a minor bug fix release.
+
+### Bug Fixes
+
+- Sorting data referenced by permutation_iterators on CUDA produces invalid results
+
+## Thrust 1.5.0
+
+Thrust 1.5.0 provides introduces new programmer productivity and performance
+  enhancements.
+New functionality for creating anonymous "lambda" functions has been added.
+A faster host sort provides 2-10x faster performance for sorting arithmetic
+  types on (single-threaded) CPUs.
+A new OpenMP sort provides 2.5x-3.0x speedup over the host sort using a
+  quad-core CPU.
+When sorting arithmetic types with the OpenMP backend the combined performance
+  improvement is 5.9x for 32-bit integers and ranges from 3.0x (64-bit types) to
+  14.2x (8-bit types).
+A new CUDA `reduce_by_key` implementation provides 2-3x faster
+  performance.
+
+### Breaking Changes
+- device_ptr<void> no longer unsafely converts to device_ptr<T> without an
+    explicit cast.
+  Use the expression device_pointer_cast(static_cast<int*>(void_ptr.get())) to
+    convert, for example, device_ptr<void> to device_ptr<int>.
+
+### New Features
+
+- Algorithms:
+  - Stencil-less `thrust::transform_if`.
+- Lambda placeholders
+
+### New Examples
+- lambda
+
+### Other Enhancements
+
+- Host sort is 2-10x faster for arithmetic types
+- OMP sort provides speedup over host sort
+- `reduce_by_key` is 2-3x faster
+- `reduce_by_key` no longer requires O(N) temporary storage
+- CUDA scan algorithms are 10-40% faster
+- `host_vector` and `device_vector` are now documented
+- out-of-memory exceptions now provide detailed information from CUDART
+- improved histogram example
+- `device_reference` now has a specialized swap
+- `reduce_by_key` and scan algorithms are compatible with `discard_iterator`
+
+### Bug Fixes
+
+- #44: Allow `thrust::host_vector` to compile when `value_type` uses
+    `__align__`.
+- #198: Allow `thrust::adjacent_difference` to permit safe in-situ operation.
+- #303: Make thrust thread-safe.
+- #313: Avoid race conditions in `thrust::device_vector::insert`.
+- #314: Avoid unintended ADL invocation when dispatching copy.
+- #365: Fix merge and set operation failures.
+
+### Known Issues
+
+- None
+
+### Acknowledgments
+
+- Thanks to Manjunath Kudlur for contributing his Carbon library, from which
+    the lambda functionality is derived.
+- Thanks to Jean-Francois Bastien for suggesting a fix for #303.
+
+## Thrust 1.4.0 (CUDA Toolkit 4.0)
+
+Thrust 1.4.0 is the first release of Thrust to be included in the CUDA Toolkit.
+Additionally, it brings many feature and performance improvements.
+New set theoretic algorithms operating on sorted sequences have been added.
+Additionally, a new fancy iterator allows discarding redundant or otherwise
+  unnecessary output from algorithms, conserving memory storage and bandwidth.
+
+### Breaking Changes
+
+- Eliminations
+  - `thrust/is_sorted.h`
+  - `thrust/utility.h`
+  - `thrust/set_intersection.h`
+  - `thrust/experimental/cuda/ogl_interop_allocator.h` and the functionality
+      therein
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::deprecated::copy_when`
+  - `thrust::deprecated::absolute_value`
+  - `thrust::gather` and `thrust::scatter` from host to device and vice versa
+      are no longer supported.
+  - Operations which modify the elements of a thrust::device_vector are no longer
+      available from source code compiled without nvcc when the device backend
+      is CUDA.
+    Instead, use the idiom from the cpp_interop example.
+
+### New Features
+
+- Algorithms:
+  - `thrust::copy_n`
+  - `thrust::merge`
+  - `thrust::set_difference`
+  - `thrust::set_symmetric_difference`
+  - `thrust::set_union`
+
+- Types
+  - `thrust::discard_iterator`
+
+- Device Support:
+  - Compute Capability 2.1 GPUs.
+
+### New Examples
+
+- run_length_decoding
+
+### Other Enhancements
+
+- Compilation warnings are substantially reduced in various contexts.
+- The compilation time of thrust::sort, thrust::stable_sort,
+    thrust::sort_by_key, and thrust::stable_sort_by_key are substantially
+    reduced.
+- A fast sort implementation is used when sorting primitive types with
+    thrust::greater.
+- The performance of thrust::set_intersection is improved.
+- The performance of thrust::fill is improved on SM 1.x devices.
+- A code example is now provided in each algorithm's documentation.
+- thrust::reverse now operates in-place
+
+### Bug Fixes
+
+- #212: `thrust::set_intersection` works correctly for large input sizes.
+- #275: `thrust::counting_iterator` and `thrust::constant_iterator` work
+    correctly with OpenMP as the backend when compiling with optimization.
+- #256: `min` and `max` correctly return their first argument as a tie-breaker
+- #248: `NDEBUG` is interpreted incorrectly
+
+### Known Issues
+
+- NVCC may generate code containing warnings when compiling some Thrust
+    algorithms.
+- When compiling with `-arch=sm_1x`, some Thrust algorithms may cause NVCC to
+    issue benign pointer advisories.
+- When compiling with `-arch=sm_1x` and -G, some Thrust algorithms may fail to
+    execute correctly.
+- `thrust::inclusive_scan`, `thrust::exclusive_scan`,
+    `thrust::inclusive_scan_by_key`, and `thrust::exclusive_scan_by_key` are
+    currently incompatible with `thrust::discard_iterator`.
+
+### Acknowledgments
+
+- Thanks to David Tarjan for improving the performance of set_intersection.
+- Thanks to Duane Merrill for continued help with sort.
+- Thanks to Nathan Whitehead for help with CUDA Toolkit integration.
+
+## Thrust 1.3.0
+
+Thrust 1.3.0 provides support for CUDA Toolkit 3.2 in addition to many feature
+  and performance enhancements.
+Performance of the sort and sort_by_key algorithms is improved by as much as 3x
+  in certain situations.
+The performance of stream compaction algorithms, such as copy_if, is improved
+  by as much as 2x.
+CUDA errors are now converted to runtime exceptions using the system_error
+  interface.
+Combined with a debug mode, also new in 1.3, runtime errors can be located with
+  greater precision.
+Lastly, a few header files have been consolidated or renamed for clarity.
+See the deprecations section below for additional details.
+
+### Breaking Changes
+
+- Promotions
+  - thrust::experimental::inclusive_segmented_scan has been renamed
+      thrust::inclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::exclusive_segmented_scan has been renamed
+      thrust::exclusive_scan_by_key and exposes a different interface
+  - thrust::experimental::partition_copy has been renamed
+      thrust::partition_copy and exposes a different interface
+  - thrust::next::gather has been renamed thrust::gather
+  - thrust::next::gather_if has been renamed thrust::gather_if
+  - thrust::unique_copy_by_key has been renamed thrust::unique_by_key_copy
+- Deprecations
+  - thrust::copy_when has been renamed thrust::deprecated::copy_when
+  - thrust::absolute_value has been renamed thrust::deprecated::absolute_value
+  - The header thrust/set_intersection.h is now deprecated; use
+      thrust/set_operations.h instead
+  - The header thrust/utility.h is now deprecated; use thrust/swap.h instead
+  - The header thrust/swap_ranges.h is now deprecated; use thrust/swap.h instead
+- Eliminations
+  - thrust::deprecated::gather
+  - thrust::deprecated::gather_if
+  - thrust/experimental/arch.h and the functions therein
+  - thrust/sorting/merge_sort.h
+  - thrust/sorting/radix_sort.h
+- NVCC 2.3 is no longer supported
+
+### New Features
+
+- Algorithms:
+  - `thrust::exclusive_scan_by_key`
+  - `thrust::find`
+  - `thrust::find_if`
+  - `thrust::find_if_not`
+  - `thrust::inclusive_scan_by_key`
+  - `thrust::is_partitioned`
+  - `thrust::is_sorted_until`
+  - `thrust::mismatch`
+  - `thrust::partition_point`
+  - `thrust::reverse`
+  - `thrust::reverse_copy`
+  - `thrust::stable_partition_copy`
+
+- Types:
+  - `thrust::system_error` and related types.
+  - `thrust::experimental::cuda::ogl_interop_allocator`.
+  - `thrust::bit_and`, `thrust::bit_or`, and `thrust::bit_xor`.
+
+- Device Support:
+  - GF104-based GPUs.
+
+### New Examples
+
+- opengl_interop.cu
+- repeated_range.cu
+- simple_moving_average.cu
+- sparse_vector.cu
+- strided_range.cu
+
+### Other Enhancements
+
+- Performance of thrust::sort and thrust::sort_by_key is substantially improved
+    for primitive key types
+- Performance of thrust::copy_if is substantially improved
+- Performance of thrust::reduce and related reductions is improved
+- THRUST_DEBUG mode added
+- Callers of Thrust functions may detect error conditions by catching
+    thrust::system_error, which derives from std::runtime_error
+- The number of compiler warnings generated by Thrust has been substantially
+    reduced
+- Comparison sort now works correctly for input sizes > 32M
+- min & max usage no longer collides with <windows.h> definitions
+- Compiling against the OpenMP backend no longer requires nvcc
+- Performance of device_vector initialized in .cpp files is substantially
+    improved in common cases
+- Performance of thrust::sort_by_key on the host is substantially improved
+
+### Bug Fixes
+
+- Debug device code now compiles correctly
+- thrust::uninitialized_copy and thrust::uninitialized_fill now dispatch
+    constructors on the device rather than the host
+
+### Known Issues
+
+- #212 set_intersection is known to fail for large input sizes
+- partition_point is known to fail for 64b types with nvcc 3.2
+
+Acknowledgments
+- Thanks to Duane Merrill for contributing a fast CUDA radix sort implementation
+- Thanks to Erich Elsen for contributing an implementation of find_if
+- Thanks to Andrew Corrigan for contributing changes which allow the OpenMP
+    backend to compile in the absence of nvcc
+- Thanks to Andrew Corrigan, Cliff Wooley, David Coeurjolly, Janick Martinez
+    Esturo, John Bowers, Maxim Naumov, Michael Garland, and Ryuta Suzuki for
+    bug reports
+- Thanks to Cliff Woolley for help with testing
+
+## Thrust 1.2.1
+
+Thrust 1.2.1 is a small bug fix release that is compatible with the CUDA
+  Toolkit 3.1 release.
+
+### Known Issues
+
+- `thrust::inclusive_scan` and `thrust::exclusive_scan` may fail with very
+    large types.
+- MSVC may fail to compile code using both sort and binary search algorithms.
+- `thrust::uninitialized_fill` and `thrust::uninitialized_copy` dispatch
+    constructors on the host rather than the device.
+- #109: Some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads.
+- `thrust::default_random_engine::discard` is not accelerated with NVCC 2.3
+- NVCC 3.1 may fail to compile code using types derived from
+    `thrust::subtract_with_carry_engine`, such as `thrust::ranlux24` and
+    `thrust::ranlux48`.
+
+## Thrust 1.2.0
+
+Thrust 1.2.0 introduces support for compilation to multicore CPUs and the Ocelot
+  virtual machine, and several new facilities for pseudo-random number
+  generation.
+New algorithms such as set intersection and segmented reduction have also been
+  added.
+Lastly, improvements to the robustness of the CUDA backend ensure correctness
+  across a broad set of (uncommon) use cases.
+
+### Breaking Changes
+
+- `thrust::gather`'s interface was incorrect and has been removed.
+  The old interface is deprecated but will be preserved for Thrust version 1.2
+    at `thrust::deprecated::gather` and `thrust::deprecated::gather_if`.
+  The new interface is provided at `thrust::next::gather` and
+    `thrust::next::gather_if`.
+  The new interface will be promoted to `thrust::` in Thrust version 1.3.
+  For more details, please refer to [this thread](http://groups.google.com/group/thrust-users/browse_thread/thread/f5f0583cb97b51fd).
+- The `thrust::sorting` namespace has been deprecated in favor of the top-level
+    sorting functions, such as `thrust::sort` and `thrust::sort_by_key`.
+- Removed support for `thrust::equal` between host & device sequences.
+- Removed support for `thrust::scatter` between host & device sequences.
+
+### New Features
+
+- Algorithms:
+  - `thrust::reduce_by_key`
+  - `thrust::set_intersection`
+  - `thrust::unique_copy`
+  - `thrust::unique_by_key`
+  - `thrust::unique_copy_by_key`
+- Types
+- Random Number Generation:
+  - `thrust::discard_block_engine`
+  - `thrust::default_random_engine`
+  - `thrust::linear_congruential_engine`
+  - `thrust::linear_feedback_shift_engine`
+  - `thrust::subtract_with_carry_engine`
+  - `thrust::xor_combine_engine`
+  - `thrust::minstd_rand`
+  - `thrust::minstd_rand0`
+  - `thrust::ranlux24`
+  - `thrust::ranlux48`
+  - `thrust::ranlux24_base`
+  - `thrust::ranlux48_base`
+  - `thrust::taus88`
+  - `thrust::uniform_int_distribution`
+  - `thrust::uniform_real_distribution`
+  - `thrust::normal_distribution` (experimental)
+- Function Objects:
+  - `thrust::project1st`
+  - `thrust::project2nd`
+- `thrust::tie`
+- Fancy Iterators:
+  - `thrust::permutation_iterator`
+  - `thrust::reverse_iterator`
+- Vector Functions:
+  - `operator!=`
+  - `rbegin`
+  - `crbegin`
+  - `rend`
+  - `crend`
+  - `data`
+  - `shrink_to_fit`
+- Device Support:
+  - Multicore CPUs via OpenMP.
+  - Fermi-class GPUs.
+  - Ocelot virtual machines.
+- Support for NVCC 3.0.
+
+### New Examples
+
+- `cpp_integration`
+- `histogram`
+- `mode`
+- `monte_carlo`
+- `monte_carlo_disjoint_sequences`
+- `padded_grid_reduction`
+- `permutation_iterator`
+- `row_sum`
+- `run_length_encoding`
+- `segmented_scan`
+- `stream_compaction`
+- `summary_statistics`
+- `transform_iterator`
+- `word_count`
+
+### Other Enhancements
+
+- Integer sorting performance is improved when max is large but (max - min) is
+    small and when min is negative
+- Performance of `thrust::inclusive_scan` and `thrust::exclusive_scan` is
+    improved by 20-25% for primitive types.
+
+### Bug Fixes
+
+- #8 cause a compiler error if the required compiler is not found rather than a
+    mysterious error at link time
+- #42 device_ptr & device_reference are classes rather than structs,
+    eliminating warnings on certain platforms
+- #46 gather & scatter handle any space iterators correctly
+- #51 thrust::experimental::arch functions gracefully handle unrecognized GPUs
+- #52 avoid collisions with common user macros such as BLOCK_SIZE
+- #62 provide better documentation for device_reference
+- #68 allow built-in CUDA vector types to work with device_vector in pure C++
+    mode
+- #102 eliminated a race condition in device_vector::erase
+- various compilation warnings eliminated
+
+### Known Issues
+
+- inclusive_scan & exclusive_scan may fail with very large types
+- MSVC may fail to compile code using both sort and binary search algorithms
+- uninitialized_fill & uninitialized_copy dispatch constructors on the host
+    rather than the device
+- #109 some algorithms may exhibit poor performance with the OpenMP backend
+    with large numbers (>= 6) of CPU threads
+- default_random_engine::discard is not accelerated with nvcc 2.3
+
+### Acknowledgments
+
+- Thanks to Gregory Diamos for contributing a CUDA implementation of
+    set_intersection
+- Thanks to Ryuta Suzuki & Gregory Diamos for rigorously testing Thrust's unit
+    tests and examples against Ocelot
+- Thanks to Tom Bradley for contributing an implementation of normal_distribution
+- Thanks to Joseph Rhoads for contributing the example summary_statistics
+
+## Thrust 1.1.1
+
+Thrust 1.1.1 is a small bug fix release that is compatible with the CUDA
+  Toolkit 2.3a release and Mac OSX Snow Leopard.
+
+## Thrust 1.1.0
+
+Thrust 1.1.0 introduces fancy iterators, binary search functions, and several
+  specialized reduction functions.
+Experimental support for segmented scans has also been added.
+
+### Breaking Changes
+
+- `thrust::counting_iterator` has been moved into the `thrust` namespace
+    (previously `thrust::experimental`).
+
+### New Features
+
+- Algorithms:
+  - `thrust::copy_if`
+  - `thrust::lower_bound`
+  - `thrust::upper_bound`
+  - `thrust::vectorized lower_bound`
+  - `thrust::vectorized upper_bound`
+  - `thrust::equal_range`
+  - `thrust::binary_search`
+  - `thrust::vectorized binary_search`
+  - `thrust::all_of`
+  - `thrust::any_of`
+  - `thrust::none_of`
+  - `thrust::minmax_element`
+  - `thrust::advance`
+  - `thrust::inclusive_segmented_scan` (experimental)
+  - `thrust::exclusive_segmented_scan` (experimental)
+- Types:
+  - `thrust::pair`
+  - `thrust::tuple`
+  - `thrust::device_malloc_allocator`
+- Fancy Iterators:
+  - `thrust::constant_iterator`
+  - `thrust::counting_iterator`
+  - `thrust::transform_iterator`
+  - `thrust::zip_iterator`
+
+### New Examples
+
+- Computing the maximum absolute difference between vectors.
+- Computing the bounding box of a two-dimensional point set.
+- Sorting multiple arrays together (lexicographical sorting).
+- Constructing a summed area table.
+- Using `thrust::zip_iterator` to mimic an array of structs.
+- Using `thrust::constant_iterator` to increment array values.
+
+### Other Enhancements
+
+- Added pinned memory allocator (experimental).
+- Added more methods to host_vector & device_vector (issue #4).
+- Added variant of remove_if with a stencil argument (issue #29).
+- Scan and reduce use cudaFuncGetAttributes to determine grid size.
+- Exceptions are reported when temporary device arrays cannot be allocated.
+
+### Bug Fixes
+
+- #5: Make vector work for larger data types
+- #9: stable_partition_copy doesn't respect OutputIterator concept semantics
+- #10: scans should return OutputIterator
+- #16: make algorithms work for larger data types
+- #27: Dispatch radix_sort even when comp=less<T> is explicitly provided
+
+### Known Issues
+
+- Using functors with Thrust entry points may not compile on Mac OSX with gcc
+    4.0.1.
+- `thrust::uninitialized_copy` and `thrust::uninitialized_fill` dispatch
+    constructors on the host rather than the device.
+- `thrust::inclusive_scan`, `thrust::inclusive_scan_by_key`,
+    `thrust::exclusive_scan`, and `thrust::exclusive_scan_by_key` may fail when
+    used with large types with the CUDA Toolkit 3.1.
+
+## Thrust 1.0.0
+
+First production release of Thrust.
+
+### Breaking Changes
+
+- Rename top level namespace `komrade` to `thrust`.
+- Move `thrust::partition_copy` & `thrust::stable_partition_copy` into
+    `thrust::experimental` namespace until we can easily provide the standard
+    interface.
+- Rename `thrust::range` to `thrust::sequence` to avoid collision with
+    Boost.Range.
+- Rename `thrust::copy_if` to `thrust::copy_when` due to semantic differences
+    with C++0x `std::copy_if`.
+
+### New Features
+
+- Add C++0x style `cbegin` & `cend` methods to `thrust::host_vector` and
+    `thrust::device_vector`.
+- Add `thrust::transform_if` function.
+- Add stencil versions of `thrust::replace_if` & `thrust::replace_copy_if`.
+- Allow `counting_iterator` to work with `thrust::for_each`.
+- Allow types with constructors in comparison `thrust::sort` and
+    `thrust::reduce`.
+
+### Other Enhancements
+
+- `thrust::merge_sort` and `thrust::stable_merge_sort` are now 2x to 5x faster
+    when executed on the parallel device.
+
+### Bug Fixes
+
+- Komrade 6: Workaround an issue where an incremented iterator causes NVCC to
+    crash.
+- Komrade 7: Fix an issue where `const_iterator`s could not be passed to
+    `thrust::transform`.
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..967ebf53a
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,155 @@
+# 3.15 is the minimum for including the project with add_subdirectory.
+# 3.17 for building the project's standalone tests/examples/etc.
+# 3.18.3 for C++17 + CUDA
+cmake_minimum_required(VERSION 3.15)
+
+# Remove this when we use the new CUDA_ARCHITECTURES properties with both
+# nvcc and nvc++.
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(Thrust NONE)
+
+# Determine whether Thrust is the top-level project or included into
+# another project via add_subdirectory()
+if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_LIST_DIR}")
+  set(THRUST_TOPLEVEL_PROJECT ON)
+else()
+  set(THRUST_TOPLEVEL_PROJECT OFF)
+endif()
+
+## thrust_fix_clang_nvcc_build_for 
+#
+# Modifies the given target to include a fix for the clang host compiler case.
+# The fix consists of force-including a header into each compilation unit.
+#
+function(thrust_fix_clang_nvcc_build_for target)
+  if (UNIX)
+    # Path to the header containing the fix for clang + nvcc < 11.6. For more info,
+    # check the content of this header.
+    set(clang_fix_header_path "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/testing/fix_clang_nvcc_11.5.h")
+
+    # Only affects host compiler
+    target_compile_options(${target} PRIVATE 
+        "$<$<COMPILE_LANGUAGE:CUDA>:-include${clang_fix_header_path}>")
+  endif()
+endfunction()
+
+# This must be done before any languages are enabled:
+if (THRUST_TOPLEVEL_PROJECT)
+  include(cmake/ThrustCompilerHacks.cmake)
+endif()
+
+# This must appear after our Compiler Hacks or else CMake will delete the cache
+# and reconfigure from scratch.
+# This must also appear before the installation rules, as it is required by the
+# GNUInstallDirs CMake module.
+enable_language(CXX)
+
+# Optionally include installation rules for non-top-level builds:
+option(THRUST_ENABLE_INSTALL_RULES "Enable installation of Thrust" ${THRUST_TOPLEVEL_PROJECT})
+if (THRUST_ENABLE_INSTALL_RULES)
+  include(cmake/ThrustInstallRules.cmake)
+endif()
+
+# Support adding Thrust to a parent project via add_subdirectory.
+# See examples/cmake/add_subdir/CMakeLists.txt for details.
+if (NOT THRUST_TOPLEVEL_PROJECT)
+  include(cmake/ThrustAddSubdir.cmake)
+  return()
+endif()
+
+# We use 3.17 features when building our tests, etc.
+cmake_minimum_required(VERSION 3.17)
+
+option(THRUST_ENABLE_HEADER_TESTING "Test that all public headers compile." "ON")
+option(THRUST_ENABLE_TESTING "Build Thrust testing suite." "ON")
+option(THRUST_ENABLE_EXAMPLES "Build Thrust examples." "ON")
+option(THRUST_ENABLE_BENCHMARKS "Build Thrust runtime benchmarks." "OFF")
+option(THRUST_INCLUDE_CUB_CMAKE "Build CUB tests and examples. (Requires CUDA)." "OFF")
+
+# Mark this option as advanced for now. We'll revisit this later once the new
+# benchmarks are ready. For now, we just need to expose a way to compile
+# bench.cu from CMake for NVIDIA's internal builds.
+mark_as_advanced(THRUST_ENABLE_BENCHMARKS)
+
+# Check if we're actually building anything before continuing. If not, no need
+# to search for deps, etc. This is a common approach for packagers that just
+# need the install rules. See GH issue NVIDIA/thrust#1211.
+if (NOT (THRUST_ENABLE_HEADER_TESTING OR
+         THRUST_ENABLE_TESTING OR
+         THRUST_ENABLE_EXAMPLES OR
+         THRUST_ENABLE_BENCHMARKS OR
+         THRUST_INCLUDE_CUB_CMAKE))
+  return()
+endif()
+
+include(cmake/AppendOptionIfAvailable.cmake)
+include(cmake/ThrustBuildCompilerTargets.cmake)
+include(cmake/ThrustBuildTargetList.cmake)
+include(cmake/ThrustFindThrust.cmake)
+include(cmake/ThrustMultiConfig.cmake)
+include(cmake/ThrustUtilities.cmake)
+
+# Add cache string options for CMAKE_BUILD_TYPE and default to RelWithDebInfo.
+if ("" STREQUAL "${CMAKE_BUILD_TYPE}")
+  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build." FORCE)
+
+  set_property(
+    CACHE CMAKE_BUILD_TYPE
+    PROPERTY STRINGS Debug Release RelWithDebInfo MinSizeRel
+  )
+endif ()
+
+# Disable compiler extensions:
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+# Where to put build outputs. Use CMAKE_BINARY_DIR so they'll show up in the
+# top-level project's dir when building Thrust via add_subdirectory.
+set(THRUST_LIBRARY_OUTPUT_DIR "${CMAKE_BINARY_DIR}/lib")
+set(THRUST_EXECUTABLE_OUTPUT_DIR "${CMAKE_BINARY_DIR}/bin")
+
+thrust_configure_multiconfig()
+thrust_find_thrust()
+thrust_build_compiler_targets()
+thrust_update_system_found_flags()
+if (THRUST_CUDA_FOUND)
+  include(cmake/ThrustCudaConfig.cmake)
+endif()
+thrust_build_target_list()
+
+message(STATUS "CPP system found?  ${THRUST_CPP_FOUND}")
+message(STATUS "CUDA system found? ${THRUST_CUDA_FOUND}")
+message(STATUS "TBB system found?  ${THRUST_TBB_FOUND}")
+message(STATUS "OMP system found?  ${THRUST_OMP_FOUND}")
+
+if (THRUST_ENABLE_HEADER_TESTING)
+  include(cmake/ThrustHeaderTesting.cmake)
+endif()
+
+# Both testing and examples use ctest
+if (THRUST_ENABLE_TESTING OR THRUST_ENABLE_EXAMPLES)
+  include(CTest)
+  enable_testing()
+endif()
+
+if (THRUST_ENABLE_TESTING)
+  add_subdirectory(testing)
+endif()
+
+if (THRUST_ENABLE_EXAMPLES)
+  add_subdirectory(examples)
+endif()
+
+if (THRUST_ENABLE_BENCHMARKS)
+  add_subdirectory(internal/benchmark)
+endif()
+
+if (THRUST_INCLUDE_CUB_CMAKE AND THRUST_CUDA_FOUND)
+  set(CUB_IN_THRUST ON)
+  # CUB's path is specified generically to support both GitHub and Perforce
+  # source tree layouts. The include directory used by cub-config.cmake
+  # for source layouts is the same as the project root.
+  add_subdirectory("${_CUB_INCLUDE_DIR}" dependencies/cub)
+endif()
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 000000000..8c56af363
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,96 @@
+# Code of Conduct
+
+## Overview
+
+This document defines the Code of Conduct followed and enforced for NVIDIA C++
+  Core Compute Libraries.
+
+### Intended Audience
+
+* Community
+* Developers
+* Project Leads
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+  contributors and maintainers pledge to making participation in our project and
+  our community a harassment-free experience for everyone, regardless of age,
+  body size, disability, ethnicity, sex characteristics, gender identity and
+  expression, level of experience, education, socio-economic status, nationality,
+  personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+- Using welcoming and inclusive language.
+- Being respectful of differing viewpoints and experiences.
+- Gracefully accepting constructive criticism.
+- Focusing on what is best for the community.
+- Showing empathy towards other community members.
+
+Examples of unacceptable behavior by participants include:
+
+- The use of sexualized language or imagery and unwelcome sexual attention or
+    advances.
+- Trolling, insulting/derogatory comments, and personal or political attacks.
+- Public or private harassment.
+- Publishing others’ private information, such as a physical or electronic
+    address, without explicit permission.
+- Other conduct which could reasonably be considered inappropriate.
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+  behavior and are expected to take appropriate and fair corrective action in
+  response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+  reject comments, commits, code, wiki edits, issues, and other contributions
+  that are not aligned to this Code of Conduct, or to ban temporarily or
+  permanently any contributor for other behaviors that they deem inappropriate,
+  threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+  when an individual is representing the project or its community.
+Examples of representing a project or community include using an official
+  project email address, posting via an official social media account, or acting
+  as an appointed representative at an online or offline event.
+Representation of a project may be further defined and clarified by project
+  maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+  reported by contacting [cpp-conduct@nvidia.com](mailto:cpp-conduct@nvidia.com).
+All complaints will be reviewed and investigated and will result in a response
+  that is deemed necessary and appropriate to the circumstances.
+The project team is obligated to maintain confidentiality with regard to the
+  reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+  faith may face temporary or permanent repercussions as determined by other
+  members of the project’s leadership.
+
+## Attribution
+
+This Code of Conduct was taken from the [NVIDIA RAPIDS] project, which was
+  adapted from the [Contributor Covenant version 1.4].
+
+Please see this [FAQ] for answers to common questions about this Code of Conduct.
+
+## Contact
+
+Please email [cpp-conduct@nvidia.com] for any Code of Conduct related matters.
+
+
+[cpp-conduct@nvidia.com]: mailto:cpp-conduct@nvidia.com
+
+[FAQ]: https://www.contributor-covenant.org/faq
+
+[NVIDIA RAPIDS]: https://docs.rapids.ai/resources/conduct/
+[Contributor Covenant version 1.4]: https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
diff --git a/LICENSE b/LICENSE
index e454a5258..c22c22563 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,7 @@
+Unless otherwise noted, Thrust's source code is released under the Apache
+License, Version 2.0:
+
+================================================================================
 
                                  Apache License
                            Version 2.0, January 2004
@@ -174,5 +178,72 @@
       incurred by, or claims asserted against, such Contributor by reason
       of your accepting any such warranty or additional liability.
 
-   END OF TERMS AND CONDITIONS
-
+================================================================================
+
+Some portions of Thrust may be licensed under other compatible open-source
+licenses. Any divergence from the Apache 2 license will be noted in the source
+code where applicable.
+
+Portions under other terms include, but are not limited to:
+
+================================================================================
+
+Various C++ utility classes in Thrust are based on the Boost Iterator, Tuple,
+System, and Random Number libraries, which are provided under the Boost Software
+License:
+
+    Boost Software License - Version 1.0 - August 17th, 2003
+
+    Permission is hereby granted, free of charge, to any person or organization
+    obtaining a copy of the software and accompanying documentation covered by
+    this license (the "Software") to use, reproduce, display, distribute,
+    execute, and transmit the Software, and to prepare derivative works of the
+    Software, and to permit third-parties to whom the Software is furnished to
+    do so, all subject to the following:
+
+    The copyright notices in the Software and this entire statement, including
+    the above license grant, this restriction and the following disclaimer,
+    must be included in all copies of the Software, in whole or in part, and
+    all derivative works of the Software, unless such copies or derivative
+    works are solely in the form of machine-executable object code generated by
+    a source language processor.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+    DEALINGS IN THE SOFTWARE.
+
+================================================================================
+
+Portions of the thrust::complex implementation are derived from FreeBSD with the
+following terms:
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+       notice[1] unmodified, this list of conditions, and the following
+       disclaimer.
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+    IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+    OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+    IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+    NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+    THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+[1] Individual copyright notices from the original authors are included in
+    the relevant source files.
+
+================================================================================
diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..4b5a4a423
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,161 @@
+# Copyright 2010-2020 NVIDIA Corporation.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#		http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Makefile for building Thrust unit test driver
+
+# Force C++11 mode. NVCC will ignore it if the host compiler doesn't support it.
+export CXX_STD := c++11
+
+export CCCL_ENABLE_DEPRECATIONS := 1
+
+export VERBOSE := 1
+
+ifndef PROFILE
+  ifdef VULCAN_TOOLKIT_BASE
+    include $(VULCAN_TOOLKIT_BASE)/build/getprofile.mk
+    include $(VULCAN_TOOLKIT_BASE)/build/config/$(PROFILE).mk
+  else
+    include ../build/getprofile.mk
+    include ../build/config/$(PROFILE).mk
+  endif
+endif
+
+SOLNDIR := .
+
+ifdef VULCAN_TOOLKIT_BASE
+  include $(VULCAN_TOOLKIT_BASE)/build/config/DetectOS.mk
+else
+  include ../build/config/DetectOS.mk
+endif
+
+TMP_DIR      := built
+TMP_PREFIX   := $(ROOTDIR)
+TMP_ARCH     := $(ARCH)_$(PROFILE)_agnostic
+THRUST_MKDIR := $(TMP_PREFIX)/$(TMP_DIR)/$(TMP_ARCH)/thrust/mk
+THRUST_DIR   := $(ROOTDIR)/thrust
+
+res:=$(shell $(PYTHON) ./generate_mk.py $(THRUST_MKDIR) $(THRUST_DIR))
+
+# Use these environment variables to control what gets built:
+#
+#   TEST_ALL
+#   TEST_UNITTESTS
+#   TEST_EXAMPLES
+#   TEST_BENCH
+#   TEST_OTHER
+
+ifneq ($(TEST_ALL),)
+  override TEST_UNITTESTS := 1
+  override TEST_EXAMPLES := 1
+  override TEST_BENCH := 1
+  override TEST_OTHER := 1
+endif
+
+ifeq ($(TEST_UNITTESTS)$(TEST_EXAMPLES)$(TEST_BENCH)$(TEST_OTHER),)
+  override TEST_UNITTESTS := 1
+  override TEST_EXAMPLES := 1
+  override TEST_BENCH := 1
+  override TEST_OTHER := 1
+endif
+
+ifneq ($(TEST_OTHER),)
+  PROJECTS += internal/build/warningstester
+endif
+
+ifneq ($(TEST_BENCH),)
+  PROJECTS += internal/benchmark/bench
+endif
+
+ifneq ($(TEST_UNITTESTS),)
+  # copy existing projects
+  PROJECTS_COPY := $(PROJECTS)
+
+  # empty PROJECTS
+  PROJECTS :=
+
+  # populate PROJECTS with unit tests.
+  include $(THRUST_MKDIR)/testing.mk
+
+  # Once PROJECTS is populated with unit tests, re-add the previous projects.
+  PROJECTS += $(PROJECTS_COPY)
+endif
+
+ifneq ($(TEST_EXAMPLES),)
+  # Copy existing projects.
+  PROJECTS_COPY := $(PROJECTS)
+
+  # Empty PROJECTS.
+  PROJECTS :=
+
+  # Populate PROJECTS with examples.
+  include $(THRUST_MKDIR)/examples.mk
+
+  # Once PROJECTS is populated with examples, re-add the previous projects.
+  PROJECTS += $(PROJECTS_COPY)
+endif
+
+ifdef VULCAN_TOOLKIT_BASE
+  include $(VULCAN_TOOLKIT_BASE)/build/common.mk
+else
+  include ../build/common.mk
+endif
+
+ifeq ($(OS), win32)
+  CREATE_DVS_PACKAGE = $(ZIP) -r built/CUDA-thrust-package.zip bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES)
+  APPEND_H_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.h
+  APPEND_INL_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.inl
+  APPEND_CUH_DVS_PACKAGE = $(ZIP) -rg built/CUDA-thrust-package.zip thrust -9 -i *.cuh
+  MAKE_DVS_PACKAGE = $(CREATE_DVS_PACKAGE) && $(APPEND_H_DVS_PACKAGE) && $(APPEND_INL_DVS_PACKAGE) && $(APPEND_CUH_DVS_PACKAGE)
+else
+  TAR_FILES = bin thrust/internal/test thrust/internal/scripts thrust/internal/benchmark $(DVS_COMMON_TEST_PACKAGE_FILES)
+  TAR_FILES += `find -L thrust \( -name "*.cuh" -o -name "*.h" -o -name "*.inl" \)`
+  MAKE_DVS_PACKAGE = tar -I bzip2 -chvf built/CUDA-thrust-package.tar.bz2 $(TAR_FILES)
+endif
+
+COPY_CUB_FOR_PACKAGING = rm -rf cub && cp -rp ../cub/cub cub
+
+DVS_OPTIONS :=
+
+ifneq ($(TARGET_ARCH),$(HOST_ARCH))
+  DVS_OPTIONS += TARGET_ARCH=$(TARGET_ARCH)
+endif
+ifeq ($(TARGET_ARCH),ARMv7)
+  DVS_OPTIONS += ABITYPE=$(ABITYPE)
+endif
+
+THRUST_DVS_BUILD = release
+
+pack:
+	$(COPY_CUB_FOR_PACKAGING)
+	cd .. && $(MAKE_DVS_PACKAGE)
+
+dvs:
+	$(COPY_CUB_FOR_PACKAGING)
+# Build the CUDA Runtime in GVS, because GVS has no CUDA Runtime component.
+# This is a temporary workaround until the Tegra team adds a CUDA Runtime
+# component, which they have promised to do.
+ifdef GVS
+	$(MAKE) $(DVS_OPTIONS) -s -C ../cuda $(THRUST_DVS_BUILD)
+endif
+	$(MAKE) $(DVS_OPTIONS) $(THRUST_DVS_BUILD) THRUST_DVS=1
+	cd .. && $(MAKE_DVS_PACKAGE)
+
+dvs_release:
+	$(MAKE) dvs THRUST_DVS_BUILD=release
+
+dvs_debug:
+	$(MAKE) dvs THRUST_DVS_BUILD=debug
+
+include $(THRUST_MKDIR)/dependencies.mk
+
diff --git a/NOTICE b/NOTICE
deleted file mode 100644
index 6209bb423..000000000
--- a/NOTICE
+++ /dev/null
@@ -1,26 +0,0 @@
-Thrust includes soruce code from the Boost Iterator, Tuple, System, and Random Number libraries.
-
-    Boost Software License - Version 1.0 - August 17th, 2003
-    
-    Permission is hereby granted, free of charge, to any person or organization
-    obtaining a copy of the software and accompanying documentation covered by
-    this license (the "Software") to use, reproduce, display, distribute,
-    execute, and transmit the Software, and to prepare derivative works of the
-    Software, and to permit third-parties to whom the Software is furnished to
-    do so, all subject to the following:
-    
-    The copyright notices in the Software and this entire statement, including
-    the above license grant, this restriction and the following disclaimer,
-    must be included in all copies of the Software, in whole or in part, and
-    all derivative works of the Software, unless such copies or derivative
-    works are solely in the form of machine-executable object code generated by
-    a source language processor.
-    
-    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-    FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
-    SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
-    FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
-    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
-    DEALINGS IN THE SOFTWARE.
-
diff --git a/README.md b/README.md
index a98077d94..b885389d4 100644
--- a/README.md
+++ b/README.md
@@ -1,75 +1,253 @@
-Thrust: Code at the speed of light
-==================================
+:warning: **The Thrust repository has been archived and is now part of the unified [nvidia/cccl repository](https://github.com/nvidia/cccl). See the [announcement here](https://github.com/NVIDIA/cccl/discussions/520) for more information. Please visit the new repository for the latest updates.** :warning:
 
-Thrust is a parallel algorithms library which resembles the C++ Standard
-Template Library (STL). Thrust's **high-level** interface greatly enhances
-programmer **productivity** while enabling performance portability between
-GPUs and multicore CPUs. **Interoperability** with established technologies
-(such as CUDA, TBB, and OpenMP) facilitates integration with existing
-software. Develop **high-performance** applications rapidly with Thrust!
+# Thrust: The C++ Parallel Algorithms Library
 
-Examples
---------
+<table><tr>
+<th><b><a href="https://github.com/nvidia/thrust/tree/main/examples">Examples</a></b></th>
+<th><b><a href="https://godbolt.org/z/8E8W764E6">Godbolt</a></b></th>
+<th><b><a href="https://nvidia.github.io/thrust">Documentation</a></b></th>
+</tr></table>
 
-Thrust is best explained through examples. The following source code
-generates random numbers serially and then transfers them to a parallel
-device where they are sorted.
+Thrust is the C++ parallel algorithms library which inspired the introduction
+  of parallel algorithms to the C++ Standard Library.
+Thrust's **high-level** interface greatly enhances programmer **productivity**
+  while enabling performance portability between GPUs and multicore CPUs.
+It builds on top of established parallel programming frameworks (such as CUDA,
+  TBB, and OpenMP).
+It also provides a number of general-purpose facilities similar to those found
+  in the C++ Standard Library.
 
-```c++
+The NVIDIA C++ Standard Library is an open source project; it is available on
+  [GitHub] and included in the NVIDIA HPC SDK and CUDA Toolkit.
+If you have one of those SDKs installed, no additional installation or compiler
+  flags are needed to use libcu++.
+
+## Examples
+
+Thrust is best learned through examples.
+
+The following example generates random numbers serially and then transfers them
+  to a parallel device where they are sorted.
+
+```cuda
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
 #include <thrust/sort.h>
 #include <thrust/copy.h>
-#include <algorithm>
-#include <cstdlib>
+#include <thrust/random.h>
 
-int main(void)
-{
-  // generate 32M random numbers serially
+int main() {
+  // Generate 32M random numbers serially.
+  thrust::default_random_engine rng(1337);
+  thrust::uniform_int_distribution<int> dist;
   thrust::host_vector<int> h_vec(32 << 20);
-  std::generate(h_vec.begin(), h_vec.end(), rand);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
 
-  // transfer data to the device
+  // Transfer data to the device.
   thrust::device_vector<int> d_vec = h_vec;
 
-  // sort data on the device (846M keys per second on GeForce GTX 480)
+  // Sort data on the device.
   thrust::sort(d_vec.begin(), d_vec.end());
 
-  // transfer data back to host
+  // Transfer data back to host.
   thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
-
-  return 0;
 }
 ```
-  
-This code sample computes the sum of 100 random numbers in parallel:
 
-```c++
+[See it on Godbolt](https://godbolt.org/z/GeWEd8Er9)
+
+This example demonstrates computing the sum of some random numbers in parallel:
+
+```cuda
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/generate.h>
 #include <thrust/reduce.h>
 #include <thrust/functional.h>
-#include <algorithm>
-#include <cstdlib>
+#include <thrust/random.h>
 
-int main(void)
-{
-  // generate random data serially
-  thrust::host_vector<int> h_vec(100);
-  std::generate(h_vec.begin(), h_vec.end(), rand);
+int main() {
+  // Generate random data serially.
+  thrust::default_random_engine rng(1337);
+  thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
+  thrust::host_vector<double> h_vec(32 << 20);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
 
-  // transfer to device and compute sum
-  thrust::device_vector<int> d_vec = h_vec;
-  int x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
-  return 0;
+  // Transfer to device and compute the sum.
+  thrust::device_vector<double> d_vec = h_vec;
+  double x = thrust::reduce(d_vec.begin(), d_vec.end(), 0, thrust::plus<int>());
 }
 ```
-    
-Refer to the [Quick Start Guide](http://github.com/thrust/thrust/wiki/Quick-Start-Guide) page for further information and examples.
 
-Contributors
-------------
+[See it on Godbolt](https://godbolt.org/z/cnsbWWME7)
+
+This example show how to perform such a reduction asynchronously:
+
+```cuda
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/generate.h>
+#include <thrust/async/copy.h>
+#include <thrust/async/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/random.h>
+#include <numeric>
+
+int main() {
+  // Generate 32M random numbers serially.
+  thrust::default_random_engine rng(123456);
+  thrust::uniform_real_distribution<double> dist(-50.0, 50.0);
+  thrust::host_vector<double> h_vec(32 << 20);
+  thrust::generate(h_vec.begin(), h_vec.end(), [&] { return dist(rng); });
+
+  // Asynchronously transfer to the device.
+  thrust::device_vector<double> d_vec(h_vec.size());
+  thrust::device_event e = thrust::async::copy(h_vec.begin(), h_vec.end(),
+                                               d_vec.begin());
+
+  // After the transfer completes, asynchronously compute the sum on the device.
+  thrust::device_future<double> f0 = thrust::async::reduce(thrust::device.after(e),
+                                                           d_vec.begin(), d_vec.end(),
+                                                           0.0, thrust::plus<double>());
+
+  // While the sum is being computed on the device, compute the sum serially on
+  // the host.
+  double f1 = std::accumulate(h_vec.begin(), h_vec.end(), 0.0, thrust::plus<double>());
+}
+```
+
+[See it on Godbolt](https://godbolt.org/z/be54efaKj)
+
+## Getting The Thrust Source Code
+
+Thrust is a header-only library; there is no need to build or install the project
+unless you want to run the Thrust unit tests.
+
+The CUDA Toolkit provides a recent release of the Thrust source code in
+`include/thrust`. This will be suitable for most users.
+
+Users that wish to contribute to Thrust or try out newer features should
+recursively clone the Thrust Github repository:
+
+```
+git clone --recursive https://github.com/NVIDIA/thrust.git
+```
+
+## Using Thrust From Your Project
+
+For CMake-based projects, we provide a CMake package for use with
+`find_package`. See the [CMake README](thrust/cmake/README.md) for more
+information. Thrust can also be added via `add_subdirectory` or tools like
+the [CMake Package Manager](https://github.com/cpm-cmake/CPM.cmake).
+
+For non-CMake projects, compile with:
+- The Thrust include path (`-I<thrust repo root>`)
+- The libcu++ include path (`-I<thrust repo root>/dependencies/libcudacxx/`)
+- The CUB include path, if using the CUDA device system (`-I<thrust repo root>/dependencies/cub/`)
+- By default, the CPP host system and CUDA device system are used.
+  These can be changed using compiler definitions:
+  - `-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_XXX`,
+     where `XXX` is `CPP` (serial, default), `OMP` (OpenMP), or `TBB` (Intel TBB)
+  - `-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_XXX`, where `XXX` is
+    `CPP`, `OMP`, `TBB`, or `CUDA` (default).
+
+## Developing Thrust
+
+Thrust uses the [CMake build system] to build unit tests, examples, and header
+  tests.
+To build Thrust as a developer, it is recommended that you use our
+  containerized development system:
+
+```bash
+# Clone Thrust and CUB repos recursively:
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+
+# Build and run tests and examples:
+ci/local/build.bash
+```
+
+That does the equivalent of the following, but in a clean containerized
+  environment which has all dependencies installed:
+
+```bash
+# Clone Thrust and CUB repos recursively:
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..   # Command line interface.
+ccmake ..  # ncurses GUI (Linux only).
+cmake-gui  # Graphical UI, set source/build directories in the app.
+
+# Build:
+cmake --build . -j ${NUM_JOBS} # Invokes make (or ninja, etc).
+
+# Run tests and examples:
+ctest
+```
+
+By default, a serial `CPP` host system, `CUDA` accelerated device system, and
+  C++14 standard are used.
+This can be changed in CMake and via flags to `ci/local/build.bash`
+
+More information on configuring your Thrust build and creating a pull request
+  can be found in the [contributing section].
+
+## Licensing
+
+Thrust is an open source project developed on [GitHub].
+Thrust is distributed under the [Apache License v2.0 with LLVM Exceptions];
+  some parts are distributed under the [Apache License v2.0] and the
+  [Boost License v1.0].
+
+## CI Status
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-gpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-gpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%209%20build%20and%20device%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%2011%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%2010%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%209%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%208%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%207%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=6,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=6,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%206%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=5,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=gcc,CXX_VER=5,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20GCC%205%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=12,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=12,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2012%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=11,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2011%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=10,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%2010%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=9,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%209%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=8,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%208%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=clang,CXX_VER=7,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20Clang%207%20build%20and%20host%20tests'></a>
+
+<a href='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=icc,CXX_VER=latest,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/'><img src='https://gpuci.gpuopenanalytics.com/job/nvidia/job/thrust/job/branch/job/thrust-cpu-build/CXX_TYPE=icc,CXX_VER=latest,OS_TYPE=ubuntu,OS_VER=20.04,SDK_TYPE=cuda,SDK_VER=11.7.0-devel/badge/icon?subject=NVCC%2011.7.0%20%2B%20ICC%20build%20and%20host%20tests'></a>
+
+
+
+[GitHub]: https://github.com/nvidia/thrust
+
+[CMake section]: https://nvidia.github.io/thrust/setup/cmake_options.html
+[contributing section]: https://nvidia.github.io/thrust/contributing.html
+
+[CMake build system]: https://cmake.org
+
+[Apache License v2.0 with LLVM Exceptions]: https://llvm.org/LICENSE.txt
+[Apache License v2.0]: https://www.apache.org/licenses/LICENSE-2.0.txt
+[Boost License v1.0]: https://www.boost.org/LICENSE_1_0.txt
 
-The original creators of Thrust are [Jared Hoberock](http://github.com/jaredhoberock) and [Nathan Bell](http://research.nvidia.com/users/nathan-bell).
diff --git a/SConscript b/SConscript
deleted file mode 100644
index 39797f99f..000000000
--- a/SConscript
+++ /dev/null
@@ -1,43 +0,0 @@
-import os
-import re
-Import('env')
-
-# clone the environment so as not to pollute the parent
-my_env = env.Clone()
-
-# divine the version number from thrust/version.h
-version = int(re.search('THRUST_VERSION ([0-9]+)', File('#thrust/version.h').get_contents()).group(1))
-major   = int(version / 100000)
-minor   = int(version / 100) % 1000
-subminor = version % 100
-
-# create the Thrust zip
-for item in my_env.RecursiveGlob('*', '#thrust'):
-  my_env.InstallAs(os.path.join('thrust', Dir('#thrust').rel_path(item)), item)
-# grab the CHANGELOG as well
-my_env.Install('thrust', '#CHANGELOG')
-
-# make sure to change directory into the variant dir to ensure the paths are correct in the zipfile
-# note Zip uses the special site_scons/site_tools/zip.py to WAR an issue with the chdir parameter
-thrust_zipfile = my_env.Zip('thrust-{0}.{1}.{2}.zip'.format(major,minor,subminor), 'thrust', chdir = 1)
-my_env.Alias('dist', thrust_zipfile)
-
-
-# create the examples zip
-# do not recurse into the 'targets' directory, should it exist
-for item in my_env.RecursiveGlob('*', '#examples', 'targets'):
-  # avoid included SCons-related files in the distribution
-  # XXX would be nice if we could ignore all dotfiles and anything in .gitignore
-  if item.get_path(item.get_dir()) not in ['SConscript','.sconsign.dblite']:
-    my_env.InstallAs(os.path.join('examples', Dir('#examples').rel_path(item)), item)
-# make sure to change directory into the variant dir to ensure the paths are correct in the zipfile
-# note Zip uses the special site_scons/site_tools/zip.py to WAR an issue with the chdir parameter
-examples_zipfile = my_env.Zip('examples-{0}.{1}.zip'.format(major,minor), 'examples', chdir = 1)
-my_env.Alias('dist', examples_zipfile)
-
-# generate documentation
-# note that thrust.dox instructs doxygen to output to the targets directory
-public_headers = my_env.RecursiveGlob('*.h', '#thrust', exclude='detail')
-thrust_docs = my_env.Command('doc/html', public_headers, 'doxygen doc/thrust.dox')
-my_env.Alias('doc', thrust_docs)
-
diff --git a/SConstruct b/SConstruct
deleted file mode 100644
index e96445c13..000000000
--- a/SConstruct
+++ /dev/null
@@ -1,476 +0,0 @@
-"""Exports a SCons construction environment 'env' with configuration common to all build projects"""
-EnsureSConsVersion(1,2)
-
-import os
-import platform
-import glob
-import itertools
-import subprocess
-
-
-def RecursiveGlob(env, pattern, directory = Dir('.'), exclude = '\B'):
-  """Recursively globs a directory and its children, returning a list of sources.
-  Allows exclusion of directories given a regular expression.
-  """
-  directory = Dir(directory)
-
-  result = directory.glob(pattern)
-
-  for n in directory.glob('*'):
-    # only recurse into directories which aren't in the blacklist
-    import re
-    if isinstance(n,type(directory)) and not re.match(exclude, directory.rel_path(n)):
-      result.extend(RecursiveGlob(env, pattern, n, exclude))
-  return result
-
-
-# map features to the list of compiler switches implementing them
-gnu_compiler_flags = {
-  'warn_all'           : ['-Wall'],
-  'warnings_as_errors' : ['-Werror'],
-  'release'            : ['-O2'],
-  'debug'              : ['-g'],
-  'exception_handling' : [],
-  'cpp'                : [],
-  'omp'                : ['-fopenmp'],
-  'tbb'                : [],
-  'cuda'               : [],
-  'workarounds'        : []
-}
-
-clang_compiler_flags = {
-  'warn_all'           : ['-Wall'],
-  'warnings_as_errors' : ['-Werror'],
-  'release'            : ['-O2'],
-  'debug'              : ['-g'],
-  'exception_handling' : [],
-  'cpp'                : [],
-  'omp'                : ['-fopenmp'],
-  'tbb'                : [],
-  'cuda'               : [],
-  'workarounds'        : []
-}
-
-msvc_compiler_flags = {
-  'warn_all'           : ['/Wall'],
-  'warnings_as_errors' : ['/WX'],
-  'release'            : ['/Ox'],
-  'debug'              : ['/Zi', '-D_DEBUG', '/MTd'],
-  'exception_handling' : ['/EHsc'],
-  'cpp'                : [],
-  'omp'                : ['/openmp'],
-  'tbb'                : [],
-  'cuda'               : [],
-
-  # avoid min/max problems due to windows.h
-  # suppress warnings due to "decorated name length exceeded"
-  'workarounds'        : ['/DNOMINMAX', '/wd4503']
-}
-
-compiler_to_flags = {
-  'g++' : gnu_compiler_flags,
-  'cl'  : msvc_compiler_flags,
-  'clang++'  : clang_compiler_flags
-}
-
-gnu_linker_flags = {
-  'debug'       : [],
-  'release'     : [],
-  'workarounds' : []
-}
-
-nv_linker_flags = gnu_linker_flags
-
-clang_linker_flags = {
-  'debug'       : [],
-  'release'     : [],
-  'workarounds' : ['-stdlib=libstdc++']
-}
-
-msvc_linker_flags = {
-  'debug'       : ['/debug'],
-  'release'     : [],
-  'workarounds' : ['/nologo']
-}
-
-linker_to_flags = {
-  'gcc'  : gnu_linker_flags,
-  'link' : msvc_linker_flags,
-  'nvcc' : nv_linker_flags,
-  'clang++'  : clang_linker_flags
-}
-
-
-def cuda_installation():
-  """Returns the details of CUDA's installation
-  returns (bin_path,lib_path,inc_path,library_name)
-  """
-
-  # determine defaults
-  if os.name == 'nt':
-    bin_path = 'C:/CUDA/bin'
-    lib_path = 'C:/CUDA/lib'
-    inc_path = 'C:/CUDA/include'
-  elif os.name == 'posix':
-    bin_path = '/usr/local/cuda/bin'
-    lib_path = '/usr/local/cuda/lib'
-    inc_path = '/usr/local/cuda/include'
-  else:
-    raise ValueError, 'Error: unknown OS.  Where is nvcc installed?'
-
-  if master_env['PLATFORM'] != 'darwin' and platform.machine()[-2:] == '64':
-    lib_path += '64'
-
-  # override with environement variables
-  if 'CUDA_BIN_PATH' in os.environ:
-    bin_path = os.path.abspath(os.environ['CUDA_BIN_PATH'])
-  if 'CUDA_LIB_PATH' in os.environ:
-    lib_path = os.path.abspath(os.environ['CUDA_LIB_PATH'])
-  if 'CUDA_INC_PATH' in os.environ:
-    inc_path = os.path.abspath(os.environ['CUDA_INC_PATH'])
-
-  return (bin_path,lib_path,inc_path,'cudart')
-
-
-def omp_installation(CXX):
-  """Returns the details of OpenMP's installation
-  returns (bin_path,lib_path,inc_path,library_name)
-  """
-
-  bin_path = ''
-  lib_path = ''
-  inc_path = ''
-
-  # the name of the library is compiler-dependent
-  library_name = ''
-  if CXX == 'g++':
-    library_name = 'gomp'
-  elif CXX == 'cl':
-    library_name = 'VCOMP'
-  elif CXX == 'clang++':
-    raise NotImplementedError, "OpenMP not supported together with clang"
-  else:
-    raise ValueError, "Unknown compiler. What is the name of the OpenMP library?"
-
-  return (bin_path,lib_path,inc_path,library_name)
-
-
-def tbb_installation(env):
-  """Returns the details of TBB's installation
-  returns (bin_path,lib_path,inc_path,library_name)
-  """
-
-  # determine defaults
-  if os.name == 'nt':
-    try:
-      # we assume that TBBROOT exists in the environment
-      root = env['ENV']['TBBROOT']
-
-      # choose bitness
-      bitness = 'ia32'
-      if platform.machine()[-2:] == '64':
-        bitness = 'intel64'
-
-      # choose msvc version
-      msvc_version = 'vc' + str(int(float(env['MSVC_VERSION'])))
-      
-      # assemble paths
-      bin_path = os.path.join(root, 'bin', bitness, msvc_version)
-      lib_path = os.path.join(root, 'lib', bitness, msvc_version)
-      inc_path = os.path.join(root, 'include')
-        
-    except:
-      raise ValueError, 'Where is TBB installed?'
-  else:
-    bin_path = ''
-    lib_path = ''
-    inc_path = ''
-
-  return (bin_path,lib_path,inc_path,'tbb')
-
-
-def inc_paths(env, host_backend, device_backend):
-  """Returns a list of include paths needed by the compiler"""
-  result = []
-  thrust_inc_path = Dir('.')
-
-  # note that the thrust path comes before the cuda path, which
-  # may itself contain a different version of thrust
-  result.append(thrust_inc_path)
-  
-  if host_backend == 'cuda' or device_backend == 'cuda':
-    cuda_inc_path = cuda_installation()[2]
-    result.append(cuda_inc_path)
-
-  if host_backend == 'tbb' or device_backend == 'tbb':
-    tbb_inc_path  = tbb_installation(env)[2]
-    result.append(tbb_inc_path)
-
-  return result
-  
-
-def lib_paths(env, host_backend, device_backend):
-  """Returns a list of lib paths needed by the linker"""
-  result = []
-
-  if host_backend == 'cuda' or device_backend == 'cuda':
-    cuda_lib_path = cuda_installation()[1]
-    result.append(cuda_lib_path)
-
-  if host_backend == 'tbb' or device_backend == 'tbb':
-    tbb_lib_path  = tbb_installation(env)[1]
-    result.append(tbb_lib_path)
-
-  return result
-
-
-def libs(env, CCX, host_backend, device_backend):
-  """Returns a list of libraries to link against"""
-  result = []
-
-  # when compiling with g++, link against the standard library
-  # we don't have to do this with cl
-  if CCX == 'g++':
-    result.append('stdc++')
-    result.append('m')
-
-  # link against backend-specific runtimes
-  if host_backend == 'cuda' or device_backend == 'cuda':
-    result.append(cuda_installation()[3])
-
-    # XXX clean this up
-    if env['cdp']:
-      result.append('cudadevrt')
-
-  if host_backend == 'omp' or device_backend == 'omp':
-    result.append(omp_installation(CCX)[3])
-
-  if host_backend == 'tbb' or device_backend == 'tbb':
-    result.append(tbb_installation(env)[3])
-
-  return result
-
-
-def linker_flags(LINK, mode, platform, device_backend, arch):
-  """Returns a list of command line flags needed by the linker"""
-  result = []
-
-  flags = linker_to_flags[LINK]
-
-  # debug/release
-  result.extend(flags[mode])
-
-  # unconditional workarounds
-  result.extend(flags['workarounds'])
-
-  return result
-
-  
-def macros(mode, host_backend, device_backend):
-  """Returns a list of preprocessor macros needed by the compiler"""
-  result = []
-
-  # backend defines
-  result.append('-DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_' + host_backend.upper())
-  result.append('-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_' + device_backend.upper())
-
-  if mode == 'debug':
-    # turn on thrust debug mode
-    result.append('-DTHRUST_DEBUG')
-
-  return result
-
-
-def cc_compiler_flags(CXX, mode, platform, host_backend, device_backend, warn_all, warnings_as_errors):
-  """Returns a list of command line flags needed by the c or c++ compiler"""
-  # start with all platform-independent preprocessor macros
-  result = macros(mode, host_backend, device_backend)
-
-  flags = compiler_to_flags[CXX]
-
-  # continue with unconditional flags
-
-  # exception handling
-  result.extend(flags['exception_handling'])
-
-  # finish with conditional flags
-
-  # debug/release
-  result.extend(flags[mode])
-
-  # enable host_backend code generation
-  result.extend(flags[host_backend])
-
-  # enable device_backend code generation
-  result.extend(flags[device_backend])
-
-  # Wall
-  if warn_all:
-    result.extend(flags['warn_all'])
-
-  # Werror 
-  if warnings_as_errors:
-    result.extend(flags['warnings_as_errors'])
-
-  # workarounds
-  result.extend(flags['workarounds'])
-
-  return result
-
-
-def nv_compiler_flags(mode, device_backend, arch, cdp):
-  """Returns a list of command line flags specific to nvcc"""
-  result = []
-  for machine_arch in arch:
-    # transform arch_XX to compute_XX
-    virtual_arch = machine_arch.replace('sm','compute')
-    # the weird -gencode flag is formatted like this:
-    # -gencode=arch=compute_10,code=\"sm_20,compute_20\"
-    result.append('-gencode=arch={0},\\"code={1},{2}\\"'.format(virtual_arch, machine_arch, virtual_arch))
-
-  if mode == 'debug':
-    # turn on debug mode
-    # XXX make this work when we've debugged nvcc -G
-    #result.append('-G')
-    pass
-  if device_backend != 'cuda':
-    result.append("--x=c++")
-  if cdp != False:
-    result.append("-rdc=true")
-
-  if device_backend == 'cuda' and master_env['PLATFORM'] == 'darwin':
-    (release, versioninfo, machine) = platform.mac_ver()
-    if(release[0:5] == '10.8.'):
-      result.append('-ccbin')
-      result.append(master_env.subst('$CXX'))
-
-  return result
-
-
-def command_line_variables():
-  # allow the user discretion to select the MSVC version
-  vars = Variables()
-  if os.name == 'nt':
-    vars.Add(EnumVariable('MSVC_VERSION', 'MS Visual C++ version', None, allowed_values=('8.0', '9.0', '10.0')))
-  
-  # add a variable to handle the host backend
-  vars.Add(ListVariable('host_backend', 'The host backend to target', 'cpp',
-                        ['cpp', 'omp', 'tbb']))
-  
-  # add a variable to handle the device backend
-  vars.Add(ListVariable('device_backend', 'The parallel device backend to target', 'cuda',
-                        ['cuda', 'omp', 'tbb', 'cpp']))
-  
-  # add a variable to handle release/debug mode
-  vars.Add(EnumVariable('mode', 'Release versus debug mode', 'release',
-                        allowed_values = ('release', 'debug')))
-  
-  # XXX allow the option to send sm_1x to nvcc even nvcc may not support it
-  vars.Add(ListVariable('arch', 'Compute capability code generation', 'sm_20',
-                        ['sm_10', 'sm_11', 'sm_12', 'sm_13',
-                         'sm_20', 'sm_21',
-                         'sm_30', 'sm_32', 'sm_35', 'sm_37',
-                         'sm_50']))
-
-  # add a variable to handle CUDA dynamic parallelism
-  vars.Add(BoolVariable('cdp', 'Enable CUDA dynamic parallelism', False))
-  
-  # add a variable to handle warnings
-  # only enable Wall by default on compilers other than cl
-  vars.Add(BoolVariable('Wall', 'Enable all compilation warnings', os.name != 'nt'))
-  
-  # add a variable to treat warnings as errors
-  vars.Add(BoolVariable('Werror', 'Treat warnings as errors', os.name != 'nt'))
-
-  return vars
-
-
-# create a master Environment
-vars = command_line_variables()
-
-master_env = Environment(variables = vars, tools = ['default', 'nvcc', 'zip'])
-
-# XXX it might be a better idea to harvest help text from subsidiary
-#     SConscripts and only add their help text if one of their targets
-#     is scheduled to be built
-Help(vars.GenerateHelpText(master_env))
-
-# enable RecursiveGlob
-master_env.AddMethod(RecursiveGlob)
-
-# add CUDA's lib dir to LD_LIBRARY_PATH so that we can execute commands
-# which depend on shared libraries (e.g., cudart)
-# we don't need to do this on windows
-if master_env['PLATFORM'] == 'posix':
-  master_env['ENV'].setdefault('LD_LIBRARY_PATH', []).append(cuda_installation()[1])
-elif master_env['PLATFORM'] == 'darwin':
-  master_env['ENV'].setdefault('DYLD_LIBRARY_PATH', []).append(cuda_installation()[1])
-  # Check if g++ really is g++
-  if(master_env.subst('$CXX') == 'g++'):
-    output = subprocess.check_output(['g++','--version'])
-    if(output.find('clang') != -1):
-      # It's actually clang
-      master_env.Replace(CXX = 'clang++')
-  if(master_env.subst('$CC') == 'gcc'):
-    output = subprocess.check_output(['gcc','--version'])
-    if(output.find('clang') != -1):
-      # It's actually clang
-      master_env.Replace(CC = 'clang')
-  if(master_env.subst('$LINK') == 'clang'):
-    master_env.Replace(CC = 'clang++')
-
-elif master_env['PLATFORM'] == 'win32':
-  master_env['ENV']['TBBROOT'] = os.environ['TBBROOT']
-  master_env['ENV']['PATH'] += ';' + tbb_installation(master_env)[0]
-
-
-# get the list of requested backends
-host_backends = master_env.subst('$host_backend').split()
-device_backends = master_env.subst('$device_backend').split()
-
-for (host,device) in itertools.product(host_backends, device_backends):
-  # clone the master environment for this config
-  env = master_env.Clone()
-
-  # populate the environment
-  env.Append(CPPPATH = inc_paths(env, host, device))
-  
-  env.Append(CCFLAGS = cc_compiler_flags(env.subst('$CXX'), env['mode'], env['PLATFORM'], host, device, env['Wall'], env['Werror']))
-  
-  env.Append(NVCCFLAGS = nv_compiler_flags(env['mode'], device, env['arch'], env['cdp']))
-  
-  env.Append(LIBS = libs(env, env.subst('$CXX'), host, device))
-
-  # XXX this probably doesn't belong here
-  # XXX ideally we'd integrate this into site_scons
-  if 'cudadevrt' in env['LIBS']:
-    # nvcc is required to link against cudadevrt
-    env.Replace(LINK = 'nvcc')
-
-    if os.name == 'nt':
-      # the nv linker uses the same command line as the gnu linker
-      env['LIBDIRPREFIX'] = '-L'
-      env['LIBLINKPREFIX'] = '-l'
-      env['LIBLINKSUFFIX'] = ''
-      env.Replace(LINKCOM = '$LINK -o $TARGET $LINKFLAGS $__RPATH $SOURCES $_LIBDIRFLAGS $_LIBFLAGS')
-
-  # we Replace instead of Append, to avoid picking-up MSVC-specific flags on Windows
-  env.Replace(LINKFLAGS = linker_flags(env.subst('$LINK'), env['mode'], env['PLATFORM'], device, env['arch']))
-   
-  env.Append(LIBPATH = lib_paths(env, host, device))
-  
-  # assemble the name of this configuration's targets directory
-  targets_dir = 'targets/{0}_host_{1}_device_{2}'.format(host, device, env['mode'])
-
-  # allow subsidiary SConscripts to peek at the backends
-  env['host_backend'] = host
-  env['device_backend'] = device
-  
-  # invoke each SConscript with a variant directory
-  env.SConscript('examples/SConscript',    exports='env', variant_dir = 'examples/'    + targets_dir, duplicate = 0)
-  env.SConscript('testing/SConscript',     exports='env', variant_dir = 'testing/'     + targets_dir, duplicate = 0)
-  env.SConscript('performance/SConscript', exports='env', variant_dir = 'performance/' + targets_dir, duplicate = 0)
-
-env = master_env
-master_env.SConscript('SConscript', exports='env', variant_dir = 'targets', duplicate = False)
-
diff --git a/THANKS b/THANKS
deleted file mode 100644
index 5829b113f..000000000
--- a/THANKS
+++ /dev/null
@@ -1,32 +0,0 @@
-Thrust is an open source library of parallel algorithms with an interface
-resembling the C++ Standard Template Library (STL).  The primary developers
-of Thrust are Jared Hoberock [1] and Nathan Bell [2] of NVIDIA Research.
-
-We wish to thank the following people, who have made important intellectual
-and/or software contributions to the project:
-
- * Andrew Corrigan
- * David Tarjan
- * Duane Merrill
- * Erich Elsen
- * Gregory Diamos
- * Manjunath Kudlur
- * Mark Harris
- * Michael Garland
- * Nadathur Satish
- * Nathan Whitehead
- * Ryuta Suzuki
- * Shubho Sengupta
- * Thomas Bradley
-
-We also thank the compiler group at NVIDIA for their continued improvements to
-nvcc. In particular, we appreciate the work Bastiaan Aarts has done to enhance
-nvcc's C++ support.
-
-Lastly, Thrust has greatly benefited from the design and implementation of 
-the Boost Iterator, Tuple, System, Phoenix, and Random Number libraries [3].
-
-[1] http://research.nvidia.com/users/jared-hoberock
-[2] http://research.nvidia.com/users/nathan-bell
-[3] http://www.boost.org/
-
diff --git a/ci/axis/cpu.yml b/ci/axis/cpu.yml
new file mode 100644
index 000000000..cc393169d
--- /dev/null
+++ b/ci/axis/cpu.yml
@@ -0,0 +1,61 @@
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+SDK_TYPE:
+  - cuda
+
+SDK_VER:
+  - 11.7.0-devel
+
+OS_TYPE:
+  - ubuntu
+
+OS_VER:
+  - 20.04
+
+CXX_TYPE:
+  - clang
+  - gcc
+  - icc
+
+CXX_VER:
+  - 5
+  - 6
+  - 7
+  - 8
+  - 9
+  - 10
+  - 11
+  - 12
+  - latest
+
+exclude:
+  # Excludes by `CXX_VER`.
+  - CXX_TYPE: gcc
+    CXX_VER: 12
+  - CXX_TYPE: gcc
+    CXX_VER: latest
+  - CXX_TYPE: clang
+    CXX_VER: 5
+  - CXX_TYPE: clang
+    CXX_VER: 6
+  - CXX_TYPE: clang
+    CXX_VER: latest
+  - CXX_TYPE: icc
+    CXX_VER: 5
+  - CXX_TYPE: icc
+    CXX_VER: 6
+  - CXX_TYPE: icc
+    CXX_VER: 7
+  - CXX_TYPE: icc
+    CXX_VER: 8
+  - CXX_TYPE: icc
+    CXX_VER: 9
+  - CXX_TYPE: icc
+    CXX_VER: 10
+  - CXX_TYPE: icc
+    CXX_VER: 11
+  - CXX_TYPE: icc
+    CXX_VER: 12
diff --git a/ci/axis/gpu.yml b/ci/axis/gpu.yml
new file mode 100644
index 000000000..550083aab
--- /dev/null
+++ b/ci/axis/gpu.yml
@@ -0,0 +1,22 @@
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+SDK_TYPE:
+  - cuda
+
+SDK_VER:
+  - 11.7.0-devel
+
+OS_TYPE:
+  - ubuntu
+
+OS_VER:
+  - 20.04
+
+CXX_TYPE:
+  - gcc
+
+CXX_VER:
+  - 9
diff --git a/ci/common/build.bash b/ci/common/build.bash
new file mode 100755
index 000000000..37aafaf8b
--- /dev/null
+++ b/ci/common/build.bash
@@ -0,0 +1,439 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2022 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI
+################################################################################
+
+set -e # Stop on errors.
+
+# append variable value
+# Appends ${value} to ${variable}, adding a space before ${value} if
+# ${variable} is not empty.
+function append {
+  tmp="${!1:+${!1} }${2}"
+  eval "${1}=\${tmp}"
+}
+
+# log args...
+# Prints out ${args[*]} with a gpuCI log prefix and a newline before and after.
+function log() {
+  printf "\n>>>> %s\n\n" "${*}"
+}
+
+# print_with_trailing_blank_line args...
+# Prints ${args[*]} with one blank line following, preserving newlines within
+# ${args[*]} but stripping any preceding ${args[*]}.
+function print_with_trailing_blank_line {
+  printf "%s\n\n" "${*}"
+}
+
+# echo_and_run name args...
+# Echo ${args[@]}, then execute ${args[@]}
+function echo_and_run {
+  echo "${1}: ${@:2}"
+  ${@:2}
+}
+
+# echo_and_run_timed name args...
+# Echo ${args[@]}, then execute ${args[@]} and report how long it took,
+# including ${name} in the output of the time.
+function echo_and_run_timed {
+  echo "${@:2}"
+  TIMEFORMAT=$'\n'"${1} Time: %lR"
+  time ${@:2}
+}
+
+# join_delimit <delimiter> [value [value [...]]]
+# Combine all values into a single string, separating each by a single character
+# delimiter. Eg:
+# foo=(bar baz kramble)
+# joined_foo=$(join_delimit "|" "${foo[@]}")
+# echo joined_foo # "bar|baz|kramble"
+function join_delimit {
+  local IFS="${1}"
+  shift
+  echo "${*}"
+}
+
+################################################################################
+# VARIABLES - Set up bash and environmental variables.
+################################################################################
+
+# Get the variables the Docker container set up for us: ${CXX}, ${CUDACXX}, etc.
+set +e # Don't stop on errors from /etc/cccl.bashrc.
+source /etc/cccl.bashrc
+set -e # Stop on errors.
+
+# Configure sccache.
+if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+  log "Disabling sccache (nvcxx not supported)"
+  unset ENABLE_SCCACHE
+elif [[ "${BUILD_MODE}" == "pull-request" || "${BUILD_MODE}" == "branch" ]]; then
+  # gpuCI builds cache in S3.
+  export ENABLE_SCCACHE="gpuCI"
+  # Change to 'thrust-aarch64' if we add aarch64 builds to gpuCI:
+  export SCCACHE_S3_KEY_PREFIX=thrust-linux64 # [linux64]
+  export SCCACHE_BUCKET=rapids-sccache-east
+  export SCCACHE_REGION=us-east-2
+  export SCCACHE_IDLE_TIMEOUT=32768
+else
+  export ENABLE_SCCACHE="local"
+  # local builds cache locally
+  export SCCACHE_DIR="${WORKSPACE}/build-sccache"
+fi
+
+# Set sccache compiler flags
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  export CMAKE_CUDA_COMPILER_LAUNCHER="sccache"
+  export CMAKE_CXX_COMPILER_LAUNCHER="sccache"
+  export CMAKE_C_COMPILER_LAUNCHER="sccache"
+fi
+
+# Set path.
+export PATH=/usr/local/cuda/bin:${PATH}
+
+# Set home to the job's workspace.
+export HOME=${WORKSPACE}
+
+# Per-process memory util logs:
+MEMMON_LOG=${WORKSPACE}/build/memmon_log
+
+# Switch to the build directory.
+cd ${WORKSPACE}
+mkdir -p build
+cd build
+
+# Remove any old .ninja_log file so the PrintNinjaBuildTimes step is accurate:
+rm -f .ninja_log
+
+if [[ -z "${CMAKE_BUILD_TYPE}" ]]; then
+  CMAKE_BUILD_TYPE="Release"
+fi
+
+CMAKE_BUILD_FLAGS="--"
+
+# The Docker image sets up `${CXX}` and `${CUDACXX}`.
+append CMAKE_FLAGS "-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER='${CUDACXX}'"
+
+if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+  # NVC++ isn't properly detected by CMake, so we have to tell CMake to ignore
+  # detection and explicit provide the compiler ID. Ninja currently isn't
+  # supported, so we just use makefiles.
+  append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_FORCED=ON"
+  append CMAKE_FLAGS "-DCMAKE_CUDA_COMPILER_ID=NVCXX"
+  # We use NVC++ "slim" image which only contain a single CUDA toolkit version.
+  # When using NVC++ in an environment without GPUs (like our CPU-only
+  # builders) it unfortunately defaults to the oldest CUDA toolkit version it
+  # supports, even if that version is not in the image. So, we have to
+  # explicitly tell NVC++ it which CUDA toolkit version to use.
+  CUDA_VER=$(echo ${SDK_VER} | sed 's/.*\(cuda[0-9]\+\.[0-9]\+\)/\1/')
+  append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-gpu=${CUDA_VER}"
+  # Don't stop on build failures.
+  append CMAKE_BUILD_FLAGS "-k"
+else
+  if [[ "${CXX_TYPE}" == "icc" ]]; then
+    # Only the latest version of the Intel C++ compiler, which NVCC doesn't
+    # officially support yet, is freely available.
+    append CMAKE_FLAGS "-DCMAKE_CUDA_FLAGS=-allow-unsupported-compiler"
+  fi
+  # We're using NVCC so we need to set the host compiler.
+  append CMAKE_FLAGS "-DCMAKE_CXX_COMPILER='${CXX}'"
+  append CMAKE_FLAGS "-G Ninja"
+  # Don't stop on build failures.
+  append CMAKE_BUILD_FLAGS "-k0"
+fi
+
+DETERMINE_PARALLELISM_FLAGS=""
+
+# Used to limit the number of default build threads. Any build/link
+# steps that exceed this limit will cause this script to report a
+# failure. Tune this using the memmon logs printed after each run.
+#
+# Build steps that take more memory than this limit should
+# be split into multiple steps/translation units. Any temporary
+# increases to this threshold should be reverted ASAP. The goal
+# to do decrease this as much as possible and not increase it.
+if [[ -z "${MIN_MEMORY_PER_THREAD}" ]]; then
+  if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+      MIN_MEMORY_PER_THREAD=3.0 # GiB
+  elif [[ "${CXX_TYPE}" == "icc" ]]; then
+      MIN_MEMORY_PER_THREAD=2.5 # GiB
+  else
+      MIN_MEMORY_PER_THREAD=2.0 # GiB
+  fi
+fi
+append DETERMINE_PARALLELISM_FLAGS "--min-memory-per-thread ${MIN_MEMORY_PER_THREAD}"
+
+if [[ -n "${PARALLEL_LEVEL}" ]]; then
+  append DETERMINE_PARALLELISM_FLAGS "-j ${PARALLEL_LEVEL}"
+fi
+
+# COVERAGE_PLAN options:
+# * Exhaustive
+# * Thorough
+# * Minimal
+if [[ -z "${COVERAGE_PLAN}" ]]; then
+  # `ci/local/build.bash` always sets a coverage plan, so we can assume we're
+  # in gpuCI if one was not set.
+  if [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+    # Today, NVC++ builds take too long to do anything more than Minimal.
+    COVERAGE_PLAN="Minimal"
+  elif [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${BUILD_MODE}" == "branch" ]]; then
+    # Post-commit CPU CI builds.
+    COVERAGE_PLAN="Exhaustive"
+  elif [[ "${BUILD_TYPE}" == "cpu" ]]; then
+    # Pre-commit CPU CI builds.
+    COVERAGE_PLAN="Thorough"
+  elif [[ "${BUILD_TYPE}" == "gpu" ]]; then
+    # Pre- and post-commit GPU CI builds.
+    COVERAGE_PLAN="Minimal"
+  fi
+fi
+
+case "${COVERAGE_PLAN}" in
+  Exhaustive)
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=LARGE"
+    ;;
+  Thorough)
+    # Build the legacy bench.cu. We'll probably want to remove this when we
+    # switch to the new, heavier thrust_benchmarks project.
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_BENCHMARKS=ON"
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_IGNORE_DEPRECATED_CPP_11=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_ALL=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON"
+    if [[ "${CXX_TYPE}" != "nvcxx" ]]; then
+      # NVC++ can currently only target one compute architecture at a time.
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_50=ON"
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_60=ON"
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_70=ON"
+    fi
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON"
+    ;;
+  Minimal)
+    append CMAKE_FLAGS "-DTHRUST_ENABLE_MULTICONFIG=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_DIALECT_LATEST=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CPP=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_TBB=OFF"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_OMP=OFF"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA=ON"
+    append CMAKE_FLAGS "-DTHRUST_MULTICONFIG_WORKLOAD=SMALL"
+    append CMAKE_FLAGS "-DTHRUST_INCLUDE_CUB_CMAKE=ON"
+    append CMAKE_FLAGS "-DTHRUST_AUTO_DETECT_COMPUTE_ARCHS=ON"
+    if [[ "${BUILD_TYPE}" == "cpu" ]] && [[ "${CXX_TYPE}" == "nvcxx" ]]; then
+      # If no GPU is automatically detected, NVC++ insists that you explicitly
+      # provide an architecture.
+      # TODO: This logic should really be moved into CMake, but it will be
+      # tricky to do that until CMake officially supports NVC++.
+      append CMAKE_FLAGS "-DTHRUST_ENABLE_COMPUTE_80=ON"
+    fi
+    ;;
+esac
+
+if [[ -n "${@}" ]]; then
+  append CMAKE_BUILD_FLAGS "${@}"
+fi
+
+append CTEST_FLAGS "--output-on-failure"
+
+CTEST_EXCLUSION_REGEXES=()
+
+if [[ "${BUILD_TYPE}" == "cpu" ]]; then
+  CTEST_EXCLUSION_REGEXES+=("^cub" "^thrust.*cuda")
+fi
+
+if [[ -n "${CTEST_EXCLUSION_REGEXES[@]}" ]]; then
+  CTEST_EXCLUSION_REGEX=$(join_delimit "|" "${CTEST_EXCLUSION_REGEXES[@]}")
+  append CTEST_FLAGS "-E ${CTEST_EXCLUSION_REGEX}"
+fi
+
+if [[ -n "${@}" ]]; then
+  CTEST_INCLUSION_REGEX=$(join_delimit "|" "${@}")
+  append CTEST_FLAGS "-R ^${CTEST_INCLUSION_REGEX[@]}$"
+fi
+
+# Export variables so they'll show up in the logs when we report the environment.
+export COVERAGE_PLAN
+export CMAKE_FLAGS
+export CMAKE_BUILD_FLAGS
+export CTEST_FLAGS
+
+################################################################################
+# ENVIRONMENT - Configure and print out information about the environment.
+################################################################################
+
+log "Determine system topology..."
+
+# Set `${PARALLEL_LEVEL}` if it is unset; otherwise, this just reports the
+# system topology.
+source ${WORKSPACE}/ci/common/determine_build_parallelism.bash ${DETERMINE_PARALLELISM_FLAGS}
+
+log "Get environment..."
+
+env | sort
+
+log "Check versions..."
+
+# We use sed and echo below to ensure there is always one and only trailing
+# line following the output from each tool.
+
+${CXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+echo
+
+${CUDACXX} --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+echo
+
+cmake --version 2>&1 | sed -Ez '$ s/\n*$/\n/'
+
+if [[ "${BUILD_TYPE}" == "gpu" ]]; then
+  echo
+  nvidia-smi 2>&1 | sed -Ez '$ s/\n*$/\n/'
+fi
+
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  echo
+  # Set sccache statistics to zero to capture clean run.
+  sccache --version
+  sccache --zero-stats | grep location
+fi
+
+################################################################################
+# BUILD - Build Thrust and CUB examples and tests.
+################################################################################
+
+log "Configure Thrust and CUB..."
+
+echo_and_run_timed "Configure" cmake .. --log-level=VERBOSE ${CMAKE_FLAGS}
+configure_status=$?
+
+log "Build Thrust and CUB..."
+
+# ${PARALLEL_LEVEL} needs to be passed after we run
+# determine_build_parallelism.bash, so it can't be part of ${CMAKE_BUILD_FLAGS}.
+set +e # Don't stop on build failures.
+
+# Monitor memory usage. Thresholds in GiB:
+python3 ${WORKSPACE}/ci/common/memmon.py \
+	--log-threshold 0.0 \
+	--fail-threshold ${MIN_MEMORY_PER_THREAD} \
+	--log-file ${MEMMON_LOG} \
+        &
+memmon_pid=$!
+
+echo_and_run_timed "Build" cmake --build . ${CMAKE_BUILD_FLAGS} -j ${PARALLEL_LEVEL}
+build_status=$?
+
+# Stop memmon:
+kill -s SIGINT ${memmon_pid}
+
+# Re-enable exit on failure:
+set -e
+
+################################################################################
+# TEST - Run Thrust and CUB examples and tests.
+################################################################################
+
+log "Test Thrust and CUB..."
+
+(
+  # Make sure test_status captures ctest, not tee:
+  # https://stackoverflow.com/a/999259/11130318
+  set -o pipefail
+  echo_and_run_timed "Test" ctest ${CTEST_FLAGS} | tee ctest_log
+)
+test_status=$?
+
+################################################################################
+# COMPILATION STATS
+################################################################################
+
+if [[ -n "${ENABLE_SCCACHE}" ]]; then
+  # Get sccache stats after the compile is completed
+  COMPILE_REQUESTS=$(sccache -s | grep "Compile requests \+ [0-9]\+$" | awk '{ print $NF }')
+  CACHE_HITS=$(sccache -s | grep "Cache hits \+ [0-9]\+$" | awk '{ print $NF }')
+  HIT_RATE=$(echo - | awk "{printf \"%.2f\n\", $CACHE_HITS / $COMPILE_REQUESTS * 100}")
+  log "sccache stats (${HIT_RATE}% hit):"
+  sccache -s
+fi
+
+################################################################################
+# COMPILE TIME INFO: Print the 20 longest running build steps (ninja only)
+################################################################################
+
+if [[ -f ".ninja_log" ]]; then
+  log "Checking slowest build steps:"
+  echo_and_run "CompileTimeInfo" cmake -P ../cmake/PrintNinjaBuildTimes.cmake | head -n 23
+fi
+
+################################################################################
+# RUNTIME INFO: Print the 20 longest running test steps
+################################################################################
+
+if [[ -f "ctest_log" ]]; then
+  log "Checking slowest test steps:"
+  echo_and_run "TestTimeInfo" cmake -DLOGFILE=ctest_log -P ../cmake/PrintCTestRunTimes.cmake | head -n 20
+fi
+
+################################################################################
+# MEMORY_USAGE
+################################################################################
+
+memmon_status=0
+if [[ -f "${MEMMON_LOG}" ]]; then
+  log "Checking memmon logfile: ${MEMMON_LOG}"
+
+  if [[ -n "$(grep -E "^FAIL" ${MEMMON_LOG})" ]]; then
+    log "error: Some build steps exceeded memory threshold (${MIN_MEMORY_PER_THREAD} GiB):"
+    grep -E "^FAIL" ${MEMMON_LOG}
+    memmon_status=1
+  else
+    log "Top memory usage per build step (all less than limit of ${MIN_MEMORY_PER_THREAD} GiB):"
+    if [[ -s ${MEMMON_LOG} ]]; then
+      # Not empty:
+      head -n5 ${MEMMON_LOG}
+    else
+      echo "None detected above logging threshold."
+    fi
+  fi
+fi
+
+################################################################################
+# SUMMARY - Print status of each step and exit with failure if needed.
+################################################################################
+
+log "Summary:"
+echo "Warnings:"
+# Not currently a failure; sccache makes these unreliable and intermittent:
+echo "- Build Memory Check: ${memmon_status}"
+echo "Failures:"
+echo "- Configure Error Code: ${configure_status}"
+echo "- Build Error Code: ${build_status}"
+echo "- Test Error Code: ${test_status}"
+
+if [[ "${configure_status}" != "0" ]] || \
+   [[ "${build_status}" != "0" ]] || \
+   [[ "${test_status}" != "0" ]]; then
+     exit 1
+fi
diff --git a/ci/common/determine_build_parallelism.bash b/ci/common/determine_build_parallelism.bash
new file mode 100755
index 000000000..9813fcb2f
--- /dev/null
+++ b/ci/common/determine_build_parallelism.bash
@@ -0,0 +1,119 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+function usage {
+  echo "Usage: ${0} [flags...]"
+  echo
+  echo "Examine the system topology to determine a reasonable amount of build"
+  echo "parallelism."
+  echo
+  echo "Exported variables:"
+  echo "  \${LOGICAL_CPUS}          : Logical processors (e.g. threads)."
+  echo "  \${PHYSICAL_CPUS}         : Physical processors (e.g. cores)."
+  echo "  \${TOTAL_MEM}             : Total system memory [GB]."
+  echo "  \${MAX_THREADS_PER_CORE}  : Maximum threads per core allowed."
+  echo "  \${MIN_MEMORY_PER_THREAD} : Minimum memory [GB] per thread allowed."
+  echo "  \${CPU_BOUND_THREADS}     : # of build threads constrained by processors."
+  echo "  \${MEM_BOUND_THREADS}     : # of build threads constrained by memory [GB]."
+  echo "  \${PARALLEL_LEVEL}        : Determined # of build threads."
+  echo "  \${MEM_PER_THREAD}        : Memory [GB] per build thread."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-q, --quiet"
+  echo "  Print nothing and only export variables."
+  echo
+  echo "-j <threads>, --jobs <threads>"
+  echo "  Explicitly set the number of build threads to use."
+  echo
+  echo "--max-threads-per-core <threads>"
+  echo "  Specify the maximum threads per core allowed (default: ${MAX_THREADS_PER_CORE} [threads/core])."
+  echo
+  echo "--min-memory-per-thread <gigabytes>"
+  echo "  Specify the minimum memory per thread allowed (default: ${MIN_MEMORY_PER_THREAD} [GBs/thread])."
+
+  exit -3
+}
+
+QUIET=0
+
+export MAX_THREADS_PER_CORE=2
+export MIN_MEMORY_PER_THREAD=4 # [GB]
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -q) ;&
+  --quiet) QUIET=1 ;;
+  -j) ;&
+  --jobs)
+    shift # The next argument is the number of threads.
+    PARALLEL_LEVEL="${1}"
+    ;;
+  --max-threads-per-core)
+    shift # The next argument is the number of threads per core.
+    MAX_THREADS_PER_CORE="${1}"
+    ;;
+  --min-memory-per-thread)
+    shift # The next argument is the amount of memory per thread.
+    MIN_MEMORY_PER_THREAD="${1}"
+    ;;
+  esac
+  shift
+done
+
+# https://stackoverflow.com/a/23378780
+if [ $(uname) == "Darwin" ]; then
+  export LOGICAL_CPUS=$(sysctl -n hw.logicalcpu_max)
+  export PHYSICAL_CPUS=$(sysctl -n hw.physicalcpu_max)
+else
+  export LOGICAL_CPUS=$(lscpu -p | egrep -v '^#' | wc -l)
+  export PHYSICAL_CPUS=$(lscpu -p | egrep -v '^#' | sort -u -t, -k 2,4 | wc -l)
+fi
+
+export TOTAL_MEM=$(awk "BEGIN { printf \"%0.4g\", $(grep MemTotal /proc/meminfo | awk '{ print $2 }') / (1024 * 1024) }")
+
+export CPU_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${PHYSICAL_CPUS} * ${MAX_THREADS_PER_CORE}) }")
+export MEM_BOUND_THREADS=$(awk "BEGIN { printf \"%.04g\", int(${TOTAL_MEM} / ${MIN_MEMORY_PER_THREAD}) }")
+
+if [[ -z "${PARALLEL_LEVEL}" ]]; then
+  # Pick the smaller of the two as the default.
+  if [[ "${MEM_BOUND_THREADS}" -lt "${CPU_BOUND_THREADS}" ]]; then
+    export PARALLEL_LEVEL=${MEM_BOUND_THREADS}
+  else
+    export PARALLEL_LEVEL=${CPU_BOUND_THREADS}
+  fi
+else
+  EXPLICIT_PARALLEL_LEVEL=1
+fi
+
+# This can be a floating point number.
+export MEM_PER_THREAD=$(awk "BEGIN { printf \"%.04g\", ${TOTAL_MEM} / ${PARALLEL_LEVEL} }")
+
+if [[ "${QUIET}" == 0 ]]; then
+  echo    "Logical CPUs:           ${LOGICAL_CPUS} [threads]"
+  echo    "Physical CPUs:          ${PHYSICAL_CPUS} [cores]"
+  echo    "Total Mem:              ${TOTAL_MEM} [GBs]"
+  echo    "Max Threads Per Core:   ${MAX_THREADS_PER_CORE} [threads/core]"
+  echo    "Min Memory Per Threads: ${MIN_MEMORY_PER_THREAD} [GBs/thread]"
+  echo    "CPU Bound Threads:      ${CPU_BOUND_THREADS} [threads]"
+  echo    "Mem Bound Threads:      ${MEM_BOUND_THREADS} [threads]"
+
+  echo -n "Parallel Level:         ${PARALLEL_LEVEL} [threads]"
+  if [[ -n "${EXPLICIT_PARALLEL_LEVEL}" ]]; then
+    echo " (explicitly set)"
+  else
+    echo
+  fi
+
+  echo    "Mem Per Thread:         ${MEM_PER_THREAD} [GBs/thread]"
+fi
+
diff --git a/ci/common/memmon.py b/ci/common/memmon.py
new file mode 100755
index 000000000..505503733
--- /dev/null
+++ b/ci/common/memmon.py
@@ -0,0 +1,110 @@
+#! /usr/bin/env python
+
+# Copyright (c) 2022 NVIDIA Corporation
+# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+help_text = """%(prog)s [reference.json compare.json | reference_dir/ compare_dir/]
+
+This script:
+
+1. Runs `top -bco RES`, continuously extracting the memory usage of each process.
+2. If a process uses more than `log_threshold` GiB and exceeds any other recorded
+   entry for the process, it is stored in `entries`.
+3. When this script receives SIGINT, it writes two files:
+  * `log_file` will contain all recorded max-memory-per-process entries
+  * `fail_file` will contain all entries that exceed `fail_threshold`
+"""
+
+import argparse
+import os
+import re
+import signal
+import sys
+
+from subprocess import Popen, PIPE, STDOUT
+
+parser = argparse.ArgumentParser(prog='memmon.py', usage=help_text)
+parser.add_argument('--log-threshold', type=float, dest='log_threshold',
+                    default=0.5,
+                    help='Logging threshold in GiB.')
+parser.add_argument('--fail-threshold', type=float, dest='fail_threshold',
+                    default=2,
+                    help='Failure threshold in GiB.')
+parser.add_argument('--log-file', type=str, dest='log_file', default='memmon_log',
+                    help='Output file for log entries.')
+args, unused = parser.parse_known_args()
+
+entries = {}
+
+
+def signal_handler(sig, frame):
+    # Sort by mem:
+    sortentries = sorted(entries.items(), key=lambda x: x[1], reverse=True)
+
+    lf = open(args.log_file, "w")
+
+    for com, mem in sortentries:
+        status = "PASS"
+        if mem >= args.fail_threshold:
+            status = "FAIL"
+        line = "%4s | %3.1f GiB | %s\n" % (status, mem, com)
+        lf.write(line)
+
+    lf.close()
+    sys.exit(0)
+
+
+signal.signal(signal.SIGINT, signal_handler)
+
+# Find the toprc config file and configure top's env.
+# This config:
+# - Hides all columns except for RES and COMMAND
+# - Sorts by RES
+# - Enables long command strings (-c)
+script_dir = os.path.dirname(os.path.realpath(__file__))
+config_dir = os.path.join(script_dir, 'memmon_config')
+
+proc = Popen(["top", "-b", "-w", "512"],
+             stdin=PIPE, stdout=PIPE, stderr=STDOUT,
+             env={"XDG_CONFIG_HOME": config_dir})
+
+regex = re.compile("^\\s*([0-9.]+[kmgtp]?)\\s+(.+)\\s*$")
+
+
+# Convert a memory string from top into floating point GiB
+def parse_mem(mem_str):
+    if mem_str[-1] == "k":
+        return float(mem_str[:-1]) / (1024 * 1024)
+    elif mem_str[-1] == "m":
+        return float(mem_str[:-1]) / (1024)
+    elif mem_str[-1] == "g":
+        return float(mem_str[:-1])
+    elif mem_str[-1] == "t":
+        return float(mem_str[:-1]) * 1024
+    elif mem_str[-1] == "p":  # please no
+        return float(mem_str[:-1]) * 1024 * 1024
+    # bytes:
+    return float(mem_str) / (1024 * 1024 * 1024)
+
+
+for line in proc.stdout:
+    line = line.decode()
+    match = regex.match(line)
+    if match:
+        mem = parse_mem(match.group(1))
+        if mem < args.log_threshold and mem < args.fail_threshold:
+            continue
+        com = match.group(2)
+        if com in entries and entries[com] > mem:
+            continue
+        if mem >= args.fail_threshold:
+            # Print a notice immediately -- this helps identify the failures
+            # as they happen, since `com` may not provide enough info.
+            print("memmon.py failure: Build step exceed memory threshold:\n"
+                  "  - Threshold: %3.1f GiB\n"
+                  "  - Usage:     %3.1f GiB\n"
+                  "  - Command:   %s" % (args.fail_threshold, mem, com))
+        entries[com] = mem
diff --git a/ci/common/memmon_config/procps/toprc b/ci/common/memmon_config/procps/toprc
new file mode 100644
index 000000000..883a482ce
--- /dev/null
+++ b/ci/common/memmon_config/procps/toprc
@@ -0,0 +1,16 @@
+top's Config File (Linux processes with windows)
+Id:i, Mode_altscr=0, Mode_irixps=1, Delay_time=3.0, Curwin=0
+Def	fieldscur=%(34;�@D7:9�&')*+,-./012568<>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193972, sortindx=18, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=1, msgsclr=1, headclr=3, taskclr=1
+Job	fieldscur=�����(��Ļ�@<��)*+,-./012568>?ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=0, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=6, msgsclr=6, headclr=7, taskclr=6
+Mem	fieldscur=���<�����MBN�D34��&'()*+,-./0125689FGHIJKLOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=21, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=5, msgsclr=5, headclr=4, taskclr=5
+Usr	fieldscur=�����������)+,-./1234568;<=>?@ABCFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghij
+	winflags=193844, sortindx=3, maxtasks=0, graph_cpus=0, graph_mems=0
+	summclr=3, msgsclr=3, headclr=2, taskclr=3
+Fixed_widest=0, Summ_mscale=1, Task_mscale=0, Zero_suppress=0
+
diff --git a/ci/cpu/build.bash b/ci/cpu/build.bash
new file mode 100755
index 000000000..69b99bbec
--- /dev/null
+++ b/ci/cpu/build.bash
@@ -0,0 +1,14 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI (CPU-only)
+################################################################################
+
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+
+source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/gpu/build.bash b/ci/gpu/build.bash
new file mode 100755
index 000000000..f6cdf021c
--- /dev/null
+++ b/ci/gpu/build.bash
@@ -0,0 +1,14 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB build script for gpuCI (heterogeneous)
+################################################################################
+
+export PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+
+source ${WORKSPACE}/ci/common/build.bash
diff --git a/ci/local/build.bash b/ci/local/build.bash
new file mode 100755
index 000000000..8b20ef063
--- /dev/null
+++ b/ci/local/build.bash
@@ -0,0 +1,224 @@
+#! /usr/bin/env bash
+
+# Copyright (c) 2018-2020 NVIDIA Corporation
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+# Released under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+
+################################################################################
+# Thrust and CUB local containerized build script
+################################################################################
+
+function usage {
+  echo "Usage: ${0} [flags...] [cmake-targets...]"
+  echo
+  echo "Build and test your local repository using a gpuCI Docker image."
+  echo "If CMake targets are specified, only those targets are built and tested."
+  echo "Otherwise, everything is built and tested."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-r <path>, --repository <path>"
+  echo "  Path to the repository (default: ${REPOSITORY_PATH})."
+  echo
+  echo "-i <image>, --image <image>"
+  echo "  Docker image to use (default: ${IMAGE})"
+  echo
+  echo "-l, --local-image"
+  echo "  Use the local version of the image instead of pulling from Docker hub."
+  echo
+  echo "-s, --shell-only"
+  echo "  Skip building and testing and launch an interactive shell instead."
+  echo
+  echo "-d, --disable-gpus"
+  echo "  Don't start the container with the NVIDIA runtime and GPUs attached."
+  echo
+  echo "-c, --clean"
+  echo "  If the build directory already exists, delete it."
+  echo
+  echo "-j <threads>, --jobs <threads>"
+  echo "  Number of threads to use when building (default: inferred)."
+  echo
+  echo "-b <type>, --cmake-build-type <plan>"
+  echo "  CMake build type to use, either Release, RelWithDebInfo, or Debug"
+  echo "  (default: ${CMAKE_BUILD_TYPE})."
+  echo
+  echo "-p <plan>, --coverage-plan <plan>"
+  echo "  Coverage plan to use, either Exhaustive, Thorough, or Minimal"
+  echo "  (default: ${COVERAGE_PLAN})."
+  echo
+
+  exit -3
+}
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPOSITORY_PATH=$(realpath ${SCRIPT_PATH}/../..)
+
+################################################################################
+# FLAGS - Process command line flags.
+################################################################################
+
+IMAGE="gpuci/cccl:cuda11.7.0-devel-ubuntu20.04-gcc9"
+
+LOCAL_IMAGE=0
+
+SHELL_ONLY=0
+
+BUILD_TYPE="gpu"
+
+CLEAN=0
+
+PARALLEL_LEVEL=""
+
+CMAKE_BUILD_TYPE="Release"
+
+COVERAGE_PLAN="Minimal"
+
+TARGETS=""
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -r) ;&
+  --repository)
+    shift # The next argument is the path.
+    REPOSITORY_PATH="${1}"
+    ;;
+  -i) ;&
+  --image)
+    shift # The next argument is the image.
+    IMAGE="${1}"
+    ;;
+  -l) ;&
+  --local-image) LOCAL_IMAGE=1 ;;
+  -s) ;&
+  --shell-only) SHELL_ONLY=1 ;;
+  -d) ;&
+  --disable-gpus) BUILD_TYPE="cpu" ;;
+  -c) ;&
+  --clean) CLEAN=1 ;;
+  -j) ;&
+  --jobs)
+    shift # The next argument is the number of threads.
+    PARALLEL_LEVEL="${1}"
+    ;;
+  -b) ;&
+  --cmake-build-type)
+    shift # The next argument is the build type.
+    CMAKE_BUILD_TYPE="${1}"
+    ;;
+  -p) ;&
+  --coverage-plan)
+    shift # The next argument is the coverage plan.
+    COVERAGE_PLAN="${1}"
+    ;;
+  *)
+    TARGETS="${TARGETS:+${TARGETS} }${1}"
+    ;;
+  esac
+  shift
+done
+
+################################################################################
+# PATHS - Setup paths for the container.
+################################################################################
+
+# ${REPOSITORY_PATH} is the local filesystem path to the Git repository being
+# built and tested. It can be set with the --repository flag.
+#
+# ${BUILD_PATH} is the local filesystem path that will be used for the build. It
+# is named after the image name, allowing multiple image builds to coexist on
+# the local filesystem.
+#
+# ${REPOSITORY_PATH_IN_CONTAINER} is the location of ${REPOSITORY_PATH} inside
+# the container.
+#
+# ${BUILD_PATH_IN_CONTAINER} is the location of ${BUILD_PATH} inside the
+# container.
+
+BUILD_PATH=${REPOSITORY_PATH}/build_$(echo "$(basename "${IMAGE}")" | sed -e 's/:/_/g' | sed -e 's/-/_/g')
+
+if [[ "${CLEAN}" != 0 ]]; then
+  rm -rf ${BUILD_PATH}
+fi
+
+mkdir -p ${BUILD_PATH}
+
+BASE_PATH_IN_CONTAINER="/cccl"
+
+REPOSITORY_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")"
+
+BUILD_PATH_IN_CONTAINER="${BASE_PATH_IN_CONTAINER}/$(basename "${REPOSITORY_PATH}")/build"
+
+################################################################################
+# ENVIRONMENT - Setup the thunk build script that will be run by the container.
+################################################################################
+
+# We have to run `ldconfig` to rebuild `ld.so.cache` to work around this
+# failure on Debian: https://github.com/NVIDIA/nvidia-docker/issues/1399
+
+COMMAND="sudo ldconfig; sudo ldconfig"
+if [[ "${SHELL_ONLY}" != 0 ]]; then
+  COMMAND="${COMMAND}; bash"
+else
+  COMMAND="${COMMAND}; ${REPOSITORY_PATH_IN_CONTAINER}/ci/common/build.bash ${TARGETS} || bash"
+fi
+
+################################################################################
+# GPU - Setup GPUs.
+################################################################################
+
+if [[ "${BUILD_TYPE}" == "gpu" ]]; then
+  # Limit GPUs available to the container based on ${CUDA_VISIBLE_DEVICES}.
+  if [[ -z "${CUDA_VISIBLE_DEVICES}" ]]; then
+    VISIBLE_DEVICES="all"
+  else
+    VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}"
+  fi
+
+  DOCKER_MAJOR_VER=$(docker -v | sed 's/[^[0-9]*\([0-9]*\).*/\1/')
+  GPU_OPTS="--gpus device=${VISIBLE_DEVICES}"
+  if [[ "${DOCKER_MAJOR_VER}" -lt 19 ]]
+  then
+    GPU_OPTS="--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES='${VISIBLE_DEVICES}'"
+  fi
+fi
+
+################################################################################
+# LAUNCH - Pull and launch the container.
+################################################################################
+
+NVIDIA_DOCKER_INSTALLED=$(docker info 2>&1 | grep -i runtime | grep -c nvidia)
+if [[ "${NVIDIA_DOCKER_INSTALLED}" == 0 ]]; then
+  echo "NVIDIA Docker not found, the build may fail."
+  echo "Please install it if you encounter issues: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-docker-ce"
+fi
+
+if [[ "${LOCAL_IMAGE}" == 0 ]]; then
+  docker pull "${IMAGE}"
+fi
+
+docker run --rm -it ${GPU_OPTS} \
+  --cap-add=SYS_PTRACE \
+  --user "$(id -u)":"$(id -g)" \
+  -v "${REPOSITORY_PATH}":"${REPOSITORY_PATH_IN_CONTAINER}" \
+  -v "${BUILD_PATH}":"${BUILD_PATH_IN_CONTAINER}" \
+  -v /etc/passwd:/etc/passwd:ro \
+  -v /etc/group:/etc/group:ro \
+  -v /etc/subuid:/etc/subuid:ro \
+  -v /etc/subgid:/etc/subgid:ro \
+  -v /etc/shadow:/etc/shadow:ro \
+  -v /etc/gshadow:/etc/gshadow:ro \
+  -e "WORKSPACE=${REPOSITORY_PATH_IN_CONTAINER}" \
+  -e "BUILD_TYPE=${BUILD_TYPE}" \
+  -e "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}" \
+  -e "COVERAGE_PLAN=${COVERAGE_PLAN}" \
+  -e "PARALLEL_LEVEL=${PARALLEL_LEVEL}" \
+  -w "${BUILD_PATH_IN_CONTAINER}" \
+  "${IMAGE}" bash -c "${COMMAND}"
+
diff --git a/cmake/AppendOptionIfAvailable.cmake b/cmake/AppendOptionIfAvailable.cmake
new file mode 100644
index 000000000..52dc12216
--- /dev/null
+++ b/cmake/AppendOptionIfAvailable.cmake
@@ -0,0 +1,14 @@
+include_guard(GLOBAL)
+include(CheckCXXCompilerFlag)
+
+macro (APPEND_OPTION_IF_AVAILABLE _FLAG _LIST)
+
+string(MAKE_C_IDENTIFIER "CXX_FLAG_${_FLAG}" _VAR)
+check_cxx_compiler_flag(${_FLAG} ${_VAR})
+
+if (${${_VAR}})
+  list(APPEND ${_LIST} ${_FLAG})
+endif ()
+
+endmacro ()
+
diff --git a/cmake/DetectSupportedStandards.cmake b/cmake/DetectSupportedStandards.cmake
new file mode 100644
index 000000000..5dceefdab
--- /dev/null
+++ b/cmake/DetectSupportedStandards.cmake
@@ -0,0 +1,47 @@
+# Detect the langauge standards supported by the current compilers.
+#
+# Usage: detect_supported_cxx_standards(<var_prefix> <lang> <standards>)
+#
+# - var_prefix: Used to name result variables,
+#   e.g. ${var_prefix}_${lang}_XX_SUPPORTED will be TRUE or FALSE. Defined for
+#   each XX in ${standards}.
+# - lang: The language to test: C, CXX, or CUDA.
+# - standards: List of any standard versions.
+#
+# Example: detect_supported_cxx_standards(PROJ CXX 11 14 17)
+#   - Sets the following variables in the parent scope to TRUE or FALSE:
+#     - PROJ_CXX_11_SUPPORTED
+#     - PROJ_CXX_14_SUPPORTED
+#     - PROJ_CXX_17_SUPPORTED
+#
+function(detect_supported_standards prefix lang)
+  string(TOLOWER "${lang}_std" feature_prefix)
+  foreach(standard IN LISTS ARGN)
+    set(var_name "${prefix}_${lang}_${standard}_SUPPORTED")
+    if ("${feature_prefix}_${standard}" IN_LIST CMAKE_${lang}_COMPILE_FEATURES)
+      set(${var_name} TRUE)
+    else()
+      set(${var_name} FALSE)
+    endif()
+
+
+    if (standard EQUAL 17 AND
+        (lang STREQUAL "CXX" OR lang STREQUAL "CUDA") AND
+        ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 7) OR
+         (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" AND
+          CMAKE_CXX_COMPILER_VERSION VERSION_LESS 8)))
+      # Special cases:
+      # gcc < 7 and clang < 8 don't fully support C++17.
+      # They accept the flag and have partial support, but nvcc will refuse
+      # to enable it and falls back to the default dialect for the current
+      # CXX compiler version. This breaks our CI.
+      # CMake's COMPILE_FEATURES var reports that these compilers support C++17,
+      # but we can't rely on it, so manually disable the dialect in these cases.
+      set(${var_name} FALSE)
+    endif()
+
+    message(STATUS "Testing ${lang}${standard} Support: ${${var_name}}")
+    set(${var_name} ${${var_name}} PARENT_SCOPE)
+  endforeach()
+endfunction()
diff --git a/cmake/PrintCTestRunTimes.cmake b/cmake/PrintCTestRunTimes.cmake
new file mode 100644
index 000000000..bf23b9bb6
--- /dev/null
+++ b/cmake/PrintCTestRunTimes.cmake
@@ -0,0 +1,109 @@
+## This CMake script parses the output of ctest and prints a formatted list
+## of individual test runtimes, sorted longest first.
+##
+## ctest > ctest_log
+## cmake -DLOGFILE=ctest_log \
+##       -P PrintCTestRunTimes.cmake
+##
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  message(FATAL_ERROR "Missing -DLOGFILE=<ctest output> argument.")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+string(JOIN "" regex
+  "^[ ]*[0-9]+/[0-9]+[ ]+Test[ ]+#"
+  "([0-9]+)"                          # Test ID
+  ":[ ]+"
+  "(.+)"                              # Test Name
+  "[ ]+\\.+[ ]+"
+  "(.+[^ ])"                              # Result
+  "[ ]+"
+  "([0-9]+)"                          # Seconds
+  "\\.[0-9]+[ ]+sec[ ]*$"
+)
+
+message(DEBUG "Regex: ${regex}")
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH "${regex}" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 4)
+    set(test_id      "${CMAKE_MATCH_1}")
+    set(test_name    "${CMAKE_MATCH_2}")
+    set(test_result  "${CMAKE_MATCH_3}")
+    set(tmp          "${CMAKE_MATCH_4}") # floor(runtime_seconds)
+
+    # Compute human readable time
+    math(EXPR days         "${tmp} / (60 * 60 * 24)")
+    math(EXPR tmp          "${tmp} - (${days} * 60 * 60 * 24)")
+    math(EXPR hours        "${tmp} / (60 * 60)")
+    math(EXPR tmp          "${tmp} - (${hours} * 60 * 60)")
+    math(EXPR minutes      "${tmp} / (60)")
+    math(EXPR tmp          "${tmp} - (${minutes} * 60)")
+    math(EXPR seconds      "${tmp}")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${test_id}" key)
+    string(JOIN " | " ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s"
+      "${test_result}"
+      "${test_id}: ${test_name}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no test times ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries ORDER DESCENDING)
+
+# Dump table:
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/cmake/PrintNinjaBuildTimes.cmake b/cmake/PrintNinjaBuildTimes.cmake
new file mode 100644
index 000000000..65d243d35
--- /dev/null
+++ b/cmake/PrintNinjaBuildTimes.cmake
@@ -0,0 +1,101 @@
+## This CMake script parses a .ninja_log file (LOGFILE) and prints a list of
+## build/link times, sorted longest first.
+##
+## cmake -DLOGFILE=<.ninja_log file> \
+##       -P PrintNinjaBuildTimes.cmake
+##
+## If LOGFILE is omitted, the current directory's .ninja_log file is used.
+################################################################################
+
+cmake_minimum_required(VERSION 3.15)
+
+# Prepend the string with "0" until the string length equals the specified width
+function(pad_string_with_zeros string_var width)
+  set(local_string "${${string_var}}")
+  string(LENGTH "${local_string}" size)
+  while(size LESS width)
+    string(PREPEND local_string "0")
+    string(LENGTH "${local_string}" size)
+  endwhile()
+  set(${string_var} "${local_string}" PARENT_SCOPE)
+endfunction()
+
+################################################################################
+
+if (NOT LOGFILE)
+  set(LOGFILE ".ninja_log")
+endif()
+
+# Check if logfile exists
+if (NOT EXISTS "${LOGFILE}")
+  message(FATAL_ERROR "LOGFILE does not exist ('${LOGFILE}').")
+endif()
+
+# Read the logfile and generate a map / keylist
+set(keys)
+file(STRINGS "${LOGFILE}" lines)
+foreach(line ${lines})
+
+  # Parse each build time
+  string(REGEX MATCH
+    "^([0-9]+)\t([0-9]+)\t[0-9]+\t([^\t]+)+\t[0-9a-fA-F]+$" _DUMMY "${line}")
+
+  if (CMAKE_MATCH_COUNT EQUAL 3)
+    set(start_ms ${CMAKE_MATCH_1})
+    set(end_ms ${CMAKE_MATCH_2})
+    set(command "${CMAKE_MATCH_3}")
+    math(EXPR runtime_ms "${end_ms} - ${start_ms}")
+
+    # Compute human readable time
+    math(EXPR days         "${runtime_ms} / (1000 * 60 * 60 * 24)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${days} * 1000 * 60 * 60 * 24)")
+    math(EXPR hours        "${runtime_ms} / (1000 * 60 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${hours} * 1000 * 60 * 60)")
+    math(EXPR minutes      "${runtime_ms} / (1000 * 60)")
+    math(EXPR runtime_ms   "${runtime_ms} - (${minutes} * 1000 * 60)")
+    math(EXPR seconds      "${runtime_ms} / 1000")
+    math(EXPR milliseconds "${runtime_ms} - (${seconds} * 1000)")
+
+    # Format time components
+    pad_string_with_zeros(days 3)
+    pad_string_with_zeros(hours 2)
+    pad_string_with_zeros(minutes 2)
+    pad_string_with_zeros(seconds 2)
+    pad_string_with_zeros(milliseconds 3)
+
+    # Construct table entry
+    # Later values in the file for the same command overwrite earlier entries
+    string(MAKE_C_IDENTIFIER "${command}" key)
+    set(ENTRY_${key}
+      "${days}d ${hours}h ${minutes}m ${seconds}s ${milliseconds}ms | ${command}"
+    )
+
+    # Record the key:
+    list(APPEND keys "${key}")
+  endif()
+endforeach()
+
+list(REMOVE_DUPLICATES keys)
+
+# Build the entry list:
+set(entries)
+foreach(key ${keys})
+  list(APPEND entries "${ENTRY_${key}}")
+endforeach()
+
+if (NOT entries)
+  message(FATAL_ERROR "LOGFILE contained no build entries ('${LOGFILE}').")
+endif()
+
+# Sort in descending order:
+list(SORT entries)
+list(REVERSE entries)
+
+# Dump table:
+message(STATUS "-----------------------+----------------------------")
+message(STATUS "Time                   | Command                    ")
+message(STATUS "-----------------------+----------------------------")
+
+foreach(entry ${entries})
+  message(STATUS ${entry})
+endforeach()
diff --git a/cmake/ThrustAddSubdir.cmake b/cmake/ThrustAddSubdir.cmake
new file mode 100644
index 000000000..d48aa1415
--- /dev/null
+++ b/cmake/ThrustAddSubdir.cmake
@@ -0,0 +1,6 @@
+find_package(Thrust REQUIRED CONFIG
+  NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+  HINTS "${CMAKE_CURRENT_LIST_DIR}/.."
+  COMPONENTS ${THRUST_REQUIRED_SYSTEMS}
+  OPTIONAL_COMPONENTS ${THRUST_OPTIONAL_SYSTEMS}
+)
diff --git a/cmake/ThrustBuildCompilerTargets.cmake b/cmake/ThrustBuildCompilerTargets.cmake
new file mode 100644
index 000000000..aed0ec170
--- /dev/null
+++ b/cmake/ThrustBuildCompilerTargets.cmake
@@ -0,0 +1,191 @@
+#
+# This file defines the `thrust_build_compiler_targets()` function, which
+# creates the following interface targets:
+#
+# thrust.compiler_interface
+# - Interface target providing compiler-specific options needed to build
+#   Thrust's tests, examples, etc.
+#
+# thrust.compiler_interface_cppXX
+# - Interface targets providing compiler-specific options that should only be
+#   applied to certain dialects of C++. May not be defined for all dialects.
+#
+# thrust.promote_cudafe_warnings
+# - Interface target that adds warning promotion for NVCC cudafe invocations.
+# - Only exists to work around github issue #1174 on tbb.cuda configurations.
+# - May be combined with thrust.compiler_interface when #1174 is fully resolved.
+#
+# thrust.silence_unreachable_code_warnings
+# - Interface target that silences unreachable code warnings.
+# - Used to selectively disable such warnings in unit tests caused by
+#   unconditionally thrown exceptions.
+
+function(thrust_build_compiler_targets)
+  set(cxx_compile_definitions)
+  set(cxx_compile_options)
+
+  thrust_update_system_found_flags()
+
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    append_option_if_available("/W4" cxx_compile_options)
+
+    # Treat all warnings as errors. This is only supported on Release builds,
+    # as `nv_exec_check_disable` doesn't seem to work with MSVC debug iterators
+    # and spurious warnings are emitted.
+    # See NVIDIA/thrust#1273, NVBug 3129879.
+    if (CMAKE_BUILD_TYPE STREQUAL "Release")
+      append_option_if_available("/WX" cxx_compile_options)
+    endif()
+
+    # Suppress overly-pedantic/unavoidable warnings brought in with /W4:
+    # C4324: structure was padded due to alignment specifier
+    append_option_if_available("/wd4324" cxx_compile_options)
+    # C4505: unreferenced local function has been removed
+    # The CUDA `host_runtime.h` header emits this for
+    # `__cudaUnregisterBinaryUtil`.
+    append_option_if_available("/wd4505" cxx_compile_options)
+    # C4706: assignment within conditional expression
+    # MSVC doesn't provide an opt-out for this warning when the assignment is
+    # intentional. Clang will warn for these, but suppresses the warning when
+    # double-parentheses are used around the assignment. We'll let Clang catch
+    # unintentional assignments and suppress all such warnings on MSVC.
+    append_option_if_available("/wd4706" cxx_compile_options)
+
+    # Disabled loss-of-data conversion warnings.
+    # TODO Re-enable.
+    append_option_if_available("/wd4244" cxx_compile_options)
+
+    # Disable warning about applying unary operator- to unsigned type.
+    # TODO Re-enable.
+    append_option_if_available("/wd4146" cxx_compile_options)
+
+    # MSVC STL assumes that `allocator_traits`'s allocator will use raw pointers,
+    # and the `__DECLSPEC_ALLOCATOR` macro causes issues with thrust's universal
+    # allocators:
+    #   warning C4494: 'std::allocator_traits<_Alloc>::allocate' :
+    #      Ignoring __declspec(allocator) because the function return type is not
+    #      a pointer or reference
+    # See https://github.com/microsoft/STL/issues/696
+    append_option_if_available("/wd4494" cxx_compile_options)
+
+    # Some of the async tests require /bigobj to fit all their sections into the
+    # object files:
+    append_option_if_available("/bigobj" cxx_compile_options)
+
+    # "Oh right, this is Visual Studio."
+    list(APPEND cxx_compile_definitions "NOMINMAX")
+  else()
+    append_option_if_available("-Werror" cxx_compile_options)
+    append_option_if_available("-Wall" cxx_compile_options)
+    append_option_if_available("-Wextra" cxx_compile_options)
+    append_option_if_available("-Winit-self" cxx_compile_options)
+    append_option_if_available("-Woverloaded-virtual" cxx_compile_options)
+    append_option_if_available("-Wcast-qual" cxx_compile_options)
+    append_option_if_available("-Wpointer-arith" cxx_compile_options)
+    append_option_if_available("-Wunused-local-typedef" cxx_compile_options)
+    append_option_if_available("-Wvla" cxx_compile_options)
+
+    # Disable GNU extensions (flag is clang only)
+    append_option_if_available("-Wgnu" cxx_compile_options)
+    # Calling a variadic macro with zero args is a GNU extension until C++20,
+    # but the THRUST_PP_ARITY macro is used with zero args. Need to see if this
+    # is a real problem worth fixing.
+    append_option_if_available("-Wno-gnu-zero-variadic-macro-arguments" cxx_compile_options)
+
+    # This complains about functions in CUDA system headers when used with nvcc.
+    append_option_if_available("-Wno-unused-function" cxx_compile_options)
+  endif()
+
+  if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 7.3)
+      # GCC 7.3 complains about name mangling changes due to `noexcept`
+      # becoming part of the type system; we don't care.
+      append_option_if_available("-Wno-noexcept-type" cxx_compile_options)
+    endif()
+  endif()
+
+  if ("Intel" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
+    # Disable warning that inlining is inhibited by compiler thresholds.
+    append_option_if_available("-diag-disable=11074" cxx_compile_options)
+    append_option_if_available("-diag-disable=11076" cxx_compile_options)
+  endif()
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    # Today:
+    # * NVCC accepts CUDA C++ in .cu files but not .cpp files.
+    # * NVC++ accepts CUDA C++ in .cpp files but not .cu files.
+    # TODO: This won't be necessary in the future.
+    list(APPEND cxx_compile_options -cppsuffix=cu)
+  endif()
+
+  add_library(thrust.compiler_interface INTERFACE)
+
+  foreach (cxx_option IN LISTS cxx_compile_options)
+    target_compile_options(thrust.compiler_interface INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:${cxx_option}>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVCXX>>:${cxx_option}>
+      # Only use -Xcompiler with NVCC, not NVC++.
+      #
+      # CMake can't split genexs, so this can't be formatted better :(
+      # This is:
+      # if (using CUDA and CUDA_COMPILER is NVCC) add -Xcompiler=opt:
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${cxx_option}>
+    )
+  endforeach()
+
+  foreach (cxx_definition IN LISTS cxx_compile_definitions)
+    # Add these for both CUDA and CXX targets:
+    target_compile_definitions(thrust.compiler_interface INTERFACE
+      ${cxx_definition}
+    )
+  endforeach()
+
+  # Display warning numbers from nvcc cudafe errors:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--display_error_number>
+  )
+
+  # Tell NVCC to be quiet about deprecated GPU targets:
+  target_compile_options(thrust.compiler_interface INTERFACE
+    # If using CUDA w/ NVCC...
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Wno-deprecated-gpu-targets>
+  )
+
+  # This is kept separate for Github issue #1174.
+  add_library(thrust.promote_cudafe_warnings INTERFACE)
+  target_compile_options(thrust.promote_cudafe_warnings INTERFACE
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcudafe=--promote_warnings>
+  )
+
+  # Some of our unit tests unconditionally throw exceptions, and compilers will
+  # detect that the following instructions are unreachable. This is intentional
+  # and unavoidable in these cases. This target can be used to silence
+  # unreachable code warnings.
+  add_library(thrust.silence_unreachable_code_warnings INTERFACE)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    target_compile_options(thrust.silence_unreachable_code_warnings INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4702>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4702>
+    )
+  endif()
+
+  # These targets are used for dialect-specific options:
+  add_library(thrust.compiler_interface_cpp11 INTERFACE)
+  add_library(thrust.compiler_interface_cpp14 INTERFACE)
+
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+    # C4127: conditional expression is constant
+    # Disable this MSVC warning for C++11/C++14. In C++17, we can use
+    # THRUST_IF_CONSTEXPR to address these warnings.
+    target_compile_options(thrust.compiler_interface_cpp11 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+    target_compile_options(thrust.compiler_interface_cpp14 INTERFACE
+      $<$<COMPILE_LANGUAGE:CXX>:/wd4127>
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=/wd4127>
+    )
+  endif()
+
+endfunction()
diff --git a/cmake/ThrustBuildTargetList.cmake b/cmake/ThrustBuildTargetList.cmake
new file mode 100644
index 000000000..f4adaf546
--- /dev/null
+++ b/cmake/ThrustBuildTargetList.cmake
@@ -0,0 +1,339 @@
+# This file provides utilities for building and working with thrust
+# configuration targets.
+#
+# THRUST_TARGETS
+#  - Built by the calling the `thrust_build_target_list()` function.
+#  - Each item is the name of a thrust interface target that is configured for a
+#    certain combination of host/device/dialect.
+#
+# thrust_build_target_list()
+# - Creates the THRUST_TARGETS list.
+#
+# The following functions can be used to test/set metadata on a thrust target:
+#
+# thrust_get_target_property(<prop_var> <target_name> <prop>)
+#   - Checks the ${prop} target property on thrust target ${target_name}
+#     and sets the ${prop_var} variable in the caller's scope.
+#   - <prop_var> is any valid cmake identifier.
+#   - <target_name> is the name of a thrust target.
+#   - <prop> is one of the following:
+#     - HOST: The host system. Valid values: CPP, OMP, TBB.
+#     - DEVICE: The device system. Valid values: CUDA, CPP, OMP, TBB.
+#     - DIALECT: The C++ dialect. Valid values: 11, 14, 17, 20.
+#     - PREFIX: A unique prefix that should be used to name all
+#       targets/tests/examples that use this configuration.
+#
+# thrust_get_target_properties(<target_name>)
+#   - Defines ${target_name}_${prop} in the caller's scope, for `prop` in:
+#     HOST, DEVICE, DIALECT, PREFIX. See above for details.
+#
+# thrust_clone_target_properties(<dst_target> <src_target>)
+#   - Set the HOST, DEVICE, DIALECT, PREFIX metadata on ${dst_target} to match
+#     ${src_target}. See above for details.
+#   - This *MUST* be called on any targets that link to another thrust target
+#     to ensure that dialect information is updated correctly, e.g.
+#     `thrust_clone_target_properties(${my_thrust_test} ${some_thrust_target})`
+
+define_property(TARGET PROPERTY _THRUST_HOST
+  BRIEF_DOCS "A target's host system: CPP, TBB, or OMP."
+  FULL_DOCS "A target's host system: CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DEVICE
+  BRIEF_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+  FULL_DOCS "A target's device system: CUDA, CPP, TBB, or OMP."
+)
+define_property(TARGET PROPERTY _THRUST_DIALECT
+  BRIEF_DOCS "A target's C++ dialect: 11, 14, or 17."
+  FULL_DOCS "A target's C++ dialect: 11, 14, or 17."
+)
+define_property(TARGET PROPERTY _THRUST_PREFIX
+  BRIEF_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+  FULL_DOCS "A prefix describing the config, eg. 'thrust.cpp.cuda.cpp14'."
+)
+
+function(thrust_set_target_properties target_name host device dialect prefix)
+  set_target_properties(${target_name}
+    PROPERTIES
+      _THRUST_HOST ${host}
+      _THRUST_DEVICE ${device}
+      _THRUST_DIALECT ${dialect}
+      _THRUST_PREFIX ${prefix}
+  )
+
+  get_property(langs GLOBAL PROPERTY ENABLED_LANGUAGES)
+  set(standard_features)
+  if (CUDA IN_LIST langs)
+    list(APPEND standard_features cuda_std_${dialect})
+  endif()
+  if (CXX IN_LIST langs)
+    list(APPEND standard_features cxx_std_${dialect})
+  endif()
+
+  get_target_property(type ${target_name} TYPE)
+  if (${type} STREQUAL "INTERFACE_LIBRARY")
+    target_compile_features(${target_name} INTERFACE
+      ${standard_features}
+    )
+  else()
+    target_compile_features(${target_name} PUBLIC
+      ${standard_features}
+    )
+    set_target_properties(${target_name}
+      PROPERTIES
+        CXX_STANDARD ${dialect}
+        CUDA_STANDARD ${dialect}
+        # Must manually request that the standards above are actually respected
+        # or else CMake will silently fail to configure the targets correctly...
+        # Note that this doesn't actually work as of CMake 3.16:
+        # https://gitlab.kitware.com/cmake/cmake/-/issues/20953
+        # We'll leave these properties enabled in hopes that they will someday
+        # work.
+        CXX_STANDARD_REQUIRED ON
+        CUDA_STANDARD_REQUIRED ON
+        ARCHIVE_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        LIBRARY_OUTPUT_DIRECTORY "${THRUST_LIBRARY_OUTPUT_DIR}"
+        RUNTIME_OUTPUT_DIRECTORY "${THRUST_EXECUTABLE_OUTPUT_DIR}"
+    )
+
+    # CMake still emits errors about empty CUDA_ARCHITECTURES when CMP0104
+    # is set to OLD. This suppresses the errors for good.
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+      set_target_properties(${target_name}
+        PROPERTIES
+          CUDA_ARCHITECTURES OFF
+      )
+    endif()
+
+    if ("CUDA" STREQUAL "${device}" AND
+        "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+      set_target_properties(${target_name} PROPERTIES
+        CUDA_RESOLVE_DEVICE_SYMBOLS OFF
+      )
+    endif()
+  endif()
+endfunction()
+
+# Get a thrust property from a target and store it in var_name
+# thrust_get_target_property(<var_name> <target_name> [HOST|DEVICE|DIALECT|PREFIX]
+macro(thrust_get_target_property prop_var target_name prop)
+  get_property(${prop_var} TARGET ${target_name} PROPERTY _THRUST_${prop})
+endmacro()
+
+# Defines the following string variables in the caller's scope:
+# - ${target_name}_HOST
+# - ${target_name}_DEVICE
+# - ${target_name}_DIALECT
+# - ${target_name}_PREFIX
+macro(thrust_get_target_properties target_name)
+  thrust_get_target_property(${target_name}_HOST ${target_name} HOST)
+  thrust_get_target_property(${target_name}_DEVICE ${target_name} DEVICE)
+  thrust_get_target_property(${target_name}_DIALECT ${target_name} DIALECT)
+  thrust_get_target_property(${target_name}_PREFIX ${target_name} PREFIX)
+endmacro()
+
+# Set one target's THRUST_* properties to match another target
+function(thrust_clone_target_properties dst_target src_target)
+  thrust_get_target_properties(${src_target})
+  thrust_set_target_properties(${dst_target}
+    ${${src_target}_HOST}
+    ${${src_target}_DEVICE}
+    ${${src_target}_DIALECT}
+    ${${src_target}_PREFIX}
+  )
+endfunction()
+
+# Set ${var_name} to TRUE or FALSE in the caller's scope
+function(_thrust_is_config_valid var_name host device dialect)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_${host} AND
+      THRUST_MULTICONFIG_ENABLE_SYSTEM_${device} AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect} AND
+      "${host}_${device}" IN_LIST THRUST_MULTICONFIG_WORKLOAD_${THRUST_MULTICONFIG_WORKLOAD}_CONFIGS)
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_init_target_list)
+  set(THRUST_TARGETS "" CACHE INTERNAL "" FORCE)
+endfunction()
+
+function(_thrust_add_target_to_target_list target_name host device dialect prefix)
+  thrust_set_target_properties(${target_name} ${host} ${device} ${dialect} ${prefix})
+
+  target_link_libraries(${target_name} INTERFACE
+    thrust.compiler_interface
+  )
+
+  # dialect-specific interface:
+  if (TARGET thrust.compiler_interface_cpp${dialect})
+    target_link_libraries(${target_name} INTERFACE
+      thrust.compiler_interface_cpp${dialect}
+    )
+  endif()
+
+  # Workaround Github issue #1174. cudafe promote TBB header warnings to
+  # errors, even when they're -isystem includes.
+  if ((NOT host STREQUAL "TBB") OR (NOT device STREQUAL "CUDA"))
+    target_link_libraries(${target_name} INTERFACE
+      thrust.promote_cudafe_warnings
+    )
+  endif()
+
+  set(THRUST_TARGETS ${THRUST_TARGETS} ${target_name} CACHE INTERNAL "" FORCE)
+
+  set(label "${host}.${device}.cpp${dialect}")
+  string(TOLOWER "${label}" label)
+  message(STATUS "Enabling Thrust configuration: ${label}")
+endfunction()
+
+function(_thrust_build_target_list_multiconfig)
+  # Detect supported dialects if requested -- this must happen after CUDA is
+  # enabled, if it's going to be enabled.
+  if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL OR
+      THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+    message(STATUS "Testing for supported language standards...")
+    include("${CMAKE_CURRENT_FUNCTION_LIST_DIR}/DetectSupportedStandards.cmake")
+    detect_supported_standards(THRUST CXX ${THRUST_CPP_DIALECT_OPTIONS})
+    if (THRUST_CUDA_FOUND)
+      detect_supported_standards(THRUST CUDA ${THRUST_CPP_DIALECT_OPTIONS})
+    endif()
+
+    # Take the union of supported standards in CXX and CUDA:
+    set(supported_dialects)
+    set(latest_dialect 11)
+    foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      if ((THRUST_CXX_${standard}_SUPPORTED) AND
+          ((NOT THRUST_CUDA_FOUND) OR THRUST_CUDA_${standard}_SUPPORTED))
+
+        # MSVC silently promotes C++11 to C++14 -- skip it:
+        if ((${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) AND (standard EQUAL 11))
+          continue()
+        endif()
+
+        list(APPEND supported_dialects ${standard})
+        if (latest_dialect LESS standard)
+          set(latest_dialect ${standard})
+        endif()
+      endif()
+    endforeach()
+
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_ALL)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard IN_LIST supported_dialects)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+              "Generate C++${dialect} build configurations." FORCE
+          )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        endif()
+      endforeach()
+    elseif(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST)
+      foreach(standard IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        if (standard EQUAL latest_dialect)
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} ON CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        else()
+          set(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${standard} OFF CACHE BOOL
+            "Generate C++${dialect} build configurations." FORCE
+            )
+        endif()
+      endforeach()
+    endif()
+  endif()
+
+  # Supported versions of MSVC do not distinguish between C++11 and C++14.
+  # Warn the user that they may be generating a ton of redundant targets if
+  # they explicitly requested this configuration.
+  if ("MSVC" STREQUAL "${CMAKE_CXX_COMPILER_ID}" AND
+      THRUST_MULTICONFIG_ENABLE_DIALECT_CPP11)
+    message(WARNING
+      "Supported versions of MSVC (2017+) do not distinguish between C++11 "
+      "and C++14. The requested C++11 targets may be redundant."
+    )
+  endif()
+
+  # Build THRUST_TARGETS
+  foreach(host IN LISTS THRUST_HOST_SYSTEM_OPTIONS)
+    foreach(device IN LISTS THRUST_DEVICE_SYSTEM_OPTIONS)
+      foreach(dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+        _thrust_is_config_valid(config_valid ${host} ${device} ${dialect})
+        if (config_valid)
+          set(prefix "thrust.${host}.${device}.cpp${dialect}")
+          string(TOLOWER "${prefix}" prefix)
+
+          # Configure a thrust interface target for this host/device
+          set(target_name "${prefix}")
+          thrust_create_target(${target_name}
+            HOST ${host}
+            DEVICE ${device}
+            ${THRUST_TARGET_FLAGS}
+          )
+
+          # Set configuration metadata for this thrust interface target:
+          _thrust_add_target_to_target_list(${target_name}
+            ${host} ${device} ${dialect} ${prefix}
+          )
+
+          # Create a meta target for all targets in this configuration:
+          add_custom_target(${prefix}.all)
+          add_dependencies(thrust.all ${prefix}.all)
+        endif()
+      endforeach() # dialects
+    endforeach() # devices
+  endforeach() # hosts
+
+  list(LENGTH THRUST_TARGETS count)
+  message(STATUS "${count} unique thrust.host.device.dialect configurations generated")
+endfunction()
+
+function(_thrust_build_target_list_singleconfig)
+  set(host ${THRUST_HOST_SYSTEM})
+  set(device ${THRUST_DEVICE_SYSTEM})
+  set(dialect ${THRUST_CPP_DIALECT})
+  set(prefix "thrust") # single config
+
+  _thrust_add_target_to_target_list(thrust ${host} ${device} ${dialect} ${prefix})
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_build_target_list)
+  # Clear the list of targets:
+  _thrust_init_target_list()
+
+  # Generic config flags:
+  set(THRUST_TARGET_FLAGS)
+  macro(add_flag_option flag docstring default)
+    set(opt "THRUST_${flag}")
+    option(${opt} "${docstring}" "${default}")
+    mark_as_advanced(${opt})
+    if (${${opt}})
+      list(APPEND THRUST_TARGET_FLAGS ${flag})
+    endif()
+  endmacro()
+  add_flag_option(IGNORE_DEPRECATED_CPP_DIALECT "Don't warn about any deprecated C++ standards and compilers." OFF)
+  add_flag_option(IGNORE_DEPRECATED_CPP_11 "Don't warn about deprecated C++11." OFF)
+  add_flag_option(IGNORE_DEPRECATED_COMPILER "Don't warn about deprecated compilers." OFF)
+  add_flag_option(IGNORE_CUB_VERSION_CHECK "Don't warn about mismatched CUB versions." OFF)
+  add_flag_option(IGNORE_DEPRECATED_API "Don't warn about deprecated Thrust or CUB APIs." OFF)
+
+  # Top level meta-target. Makes it easier to just build thrust targets when
+  # building both CUB and Thrust. Add all project files here so IDEs will be
+  # aware of them. This will not generate build rules.
+  file(GLOB_RECURSE all_sources
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    "${Thrust_SOURCE_DIR}/thrust/*.h"
+    "${Thrust_SOURCE_DIR}/thrust/*.inl"
+  )
+  add_custom_target(thrust.all SOURCES ${all_sources})
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_build_target_list_multiconfig()
+  else()
+    _thrust_build_target_list_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustCompilerHacks.cmake b/cmake/ThrustCompilerHacks.cmake
new file mode 100644
index 000000000..5f7b0d98e
--- /dev/null
+++ b/cmake/ThrustCompilerHacks.cmake
@@ -0,0 +1,110 @@
+# Set up compiler paths and apply temporary hacks to support NVC++.
+# This file must be included before enabling any languages.
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # If using NVC++, don't set CXX compiler
+  if (NOT "${CMAKE_CXX_COMPILER}" STREQUAL "")
+    unset(CMAKE_CXX_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different ISO C++ compiler; NVC++ acts as both, so please"
+      " unset the CMAKE_CXX_COMPILER variable."
+    )
+  endif()
+
+  # We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+  # pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+  # understand.
+  if (NOT "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR "You are using NVC++ as your CUDA C++ compiler, but have"
+      " specified a different host ISO C++ compiler; NVC++ acts as both, so"
+      " please unset the CMAKE_CUDA_HOST_COMPILER variable."
+    )
+  endif()
+
+  set(CMAKE_CXX_COMPILER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -stdpar")
+  set(CMAKE_CUDA_HOST_LINK_LAUNCHER "${CMAKE_CUDA_COMPILER}")
+  set(CMAKE_CUDA_LINK_EXECUTABLE
+    "<CMAKE_CUDA_HOST_LINK_LAUNCHER> <FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
+
+  # Setup CMAKE_CXX_LIBRARY_ARCHITECTURE on Debian/Ubuntu so that find_package
+  # works properly.
+  if (EXISTS /etc/debian_version)
+    if (NOT CMAKE_CXX_LIBRARY_ARCHITECTURE)
+      file(GLOB files_in_lib RELATIVE /lib /lib/*-linux-gnu* )
+      foreach (file ${files_in_lib})
+        if ("${file}" MATCHES "${CMAKE_LIBRARY_ARCHITECTURE_REGEX}")
+          set(CMAKE_CXX_LIBRARY_ARCHITECTURE ${file})
+          break()
+        endif()
+      endforeach()
+    endif()
+    if (NOT CMAKE_LIBRARY_ARCHITECTURE)
+      set(CMAKE_LIBRARY_ARCHITECTURE ${CMAKE_CXX_LIBRARY_ARCHITECTURE})
+    endif()
+  endif()
+endif()
+
+# We don't set CMAKE_CUDA_HOST_COMPILER for NVC++; if we do, CMake tries to
+# pass `-ccbin ${CMAKE_CUDA_HOST_COMPILER}` to NVC++, which it doesn't
+# understand.
+if ((NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}"))
+  if (NOT ("${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "" OR
+    "${CMAKE_CUDA_HOST_COMPILER}" STREQUAL "${CMAKE_CXX_COMPILER}"))
+    set(tmp "${CMAKE_CUDA_HOST_COMPILER}")
+    unset(CMAKE_CUDA_HOST_COMPILER CACHE)
+    message(FATAL_ERROR
+      "For convenience, Thrust's test harness uses CMAKE_CXX_COMPILER for the "
+      "CUDA host compiler. Refusing to overwrite specified "
+      "CMAKE_CUDA_HOST_COMPILER -- please reconfigure without setting this "
+      "variable. Currently:\n"
+      "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}\n"
+      "CMAKE_CUDA_HOST_COMPILER=${tmp}"
+    )
+  endif ()
+  set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}")
+endif ()
+
+# Temporary hacks to make NVC++ work; this requires you to define
+# `CMAKE_CUDA_COMPILER_ID=NVCXX and `CMAKE_CUDA_COMPILER_FORCED=ON`.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  # Need 3.17 for the properties used below.
+  cmake_minimum_required(VERSION 3.17)
+
+  set(CMAKE_CUDA_STANDARD_DEFAULT 03)
+
+  set(CMAKE_CUDA03_STANDARD_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_EXTENSION_COMPILE_OPTION "-std=c++03")
+  set(CMAKE_CUDA03_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA03_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA11_STANDARD_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_EXTENSION_COMPILE_OPTION "-std=c++11")
+  set(CMAKE_CUDA11_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA11_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA14_STANDARD_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_EXTENSION_COMPILE_OPTION "-std=c++14")
+  set(CMAKE_CUDA14_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA14_KNOWN_FEATURES)
+
+  set(CMAKE_CUDA17_STANDARD_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "-std=c++17")
+  set(CMAKE_CUDA17_STANDARD__HAS_FULL_SUPPORT TRUE)
+  set_property(GLOBAL PROPERTY CMAKE_CUDA17_KNOWN_FEATURES)
+
+  include(Internal/FeatureTesting)
+  include(Compiler/CMakeCommonCompilerMacros)
+  cmake_record_cuda_compile_features()
+
+  set(CMAKE_CUDA_COMPILE_FEATURES
+    ${CMAKE_CUDA03_COMPILE_FEATURES}
+    ${CMAKE_CUDA11_COMPILE_FEATURES}
+    ${CMAKE_CUDA14_COMPILE_FEATURES}
+    ${CMAKE_CUDA17_COMPILE_FEATURES}
+    ${CMAKE_CUDA20_COMPILE_FEATURES}
+  )
+endif()
diff --git a/cmake/ThrustCudaConfig.cmake b/cmake/ThrustCudaConfig.cmake
new file mode 100644
index 000000000..a585c7910
--- /dev/null
+++ b/cmake/ThrustCudaConfig.cmake
@@ -0,0 +1,200 @@
+enable_language(CUDA)
+
+set(THRUST_KNOWN_COMPUTE_ARCHS 50 52 53 60 61 62 70 72 75 80 86)
+
+if (NVIDIA STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER 11.7)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 90)
+  endif()
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.0)
+    list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37)
+  endif()
+else()
+  list(APPEND THRUST_KNOWN_COMPUTE_ARCHS 35 37 90)
+endif()
+
+# Split CUDA_FLAGS into 3 parts:
+#
+# THRUST_CUDA_FLAGS_BASE: Common CUDA flags for all targets.
+# THRUST_CUDA_FLAGS_RDC: Additional CUDA flags for targets compiled with RDC.
+# THRUST_CUDA_FLAGS_NO_RDC: Additional CUDA flags for targets compiled without RDC.
+#
+# This is necessary because CUDA SMs 5.3, 6.2, and 7.2 do not support RDC, but
+# we want to always build some targets (e.g. testing/cuda/*) with RDC.
+# We work around this by building the "always RDC" targets without support for
+# those SMs. This requires two sets of CUDA_FLAGS.
+#
+# Enabling any of those SMs along with the ENABLE_RDC options will result in a
+# configuration error.
+#
+# Because of how CMake handles the CMAKE_CUDA_FLAGS variables, every target
+# generated in a given directory will use the same value for CMAKE_CUDA_FLAGS,
+# which is determined at the end of the directory's scope. This means caution
+# should be used when trying to build different targets with different flags,
+# since they might not behave as expected. This will improve with CMake 3.18,
+# which add the DEVICE_LINK genex, fixing the issue with using per-target
+# CUDA_FLAGS: https://gitlab.kitware.com/cmake/cmake/-/issues/18265
+set(THRUST_CUDA_FLAGS_BASE "${CMAKE_CUDA_FLAGS}")
+set(THRUST_CUDA_FLAGS_RDC)
+set(THRUST_CUDA_FLAGS_NO_RDC)
+
+# Archs that don't support RDC:
+set(no_rdc_archs 53 62 72)
+
+# Find the highest arch:
+list(SORT THRUST_KNOWN_COMPUTE_ARCHS)
+list(LENGTH THRUST_KNOWN_COMPUTE_ARCHS max_idx)
+math(EXPR max_idx "${max_idx} - 1")
+list(GET THRUST_KNOWN_COMPUTE_ARCHS ${max_idx} highest_arch)
+
+option(THRUST_AUTO_DETECT_COMPUTE_ARCHS
+  "If ON, compute architectures for all GPUs in the current system are enabled and all other compute architectures are disabled."
+  OFF
+)
+
+if (THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    message(STATUS "Thrust: Using NVC++ builtin automatic compute architecture detection.")
+  else()
+    set(detect_compute_archs_source ${Thrust_SOURCE_DIR}/cmake/detect_compute_archs.cu)
+    set(detect_compute_archs_exe ${PROJECT_BINARY_DIR}/detect_compute_archs)
+    set(detect_compute_archs_error_log ${PROJECT_BINARY_DIR}/detect_compute_archs.stderr.log)
+    execute_process(
+      COMMAND ${CMAKE_CUDA_COMPILER}
+        -std=c++11
+        -o ${detect_compute_archs_exe}
+        --run
+        ${detect_compute_archs_source}
+      OUTPUT_VARIABLE detected_archs
+      OUTPUT_STRIP_TRAILING_WHITESPACE
+      ERROR_FILE ${detect_compute_archs_error_log})
+    if ("NONE" STREQUAL "${detected_archs}")
+      set(detected_message " none")
+    else()
+      foreach (arch IN LISTS detected_archs)
+        string(APPEND detected_message " sm_${arch}")
+      endforeach()
+    endif()
+    message(STATUS "Thrust: Automatically detected compute architectures:${detected_message}")
+  endif()
+endif()
+
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+option(THRUST_DISABLE_ARCH_BY_DEFAULT
+  "If ON, then all compute architectures are disabled on the initial CMake run."
+  ${option_init}
+)
+
+set(option_init ON)
+if (THRUST_DISABLE_ARCH_BY_DEFAULT OR THRUST_AUTO_DETECT_COMPUTE_ARCHS)
+  set(option_init OFF)
+endif()
+
+set(num_archs_enabled 0)
+foreach (arch IN LISTS THRUST_KNOWN_COMPUTE_ARCHS)
+  set(this_option_init ${option_init})
+
+  if (${arch} IN_LIST detected_archs)
+    set(this_option_init ON)
+  endif()
+
+  option(THRUST_ENABLE_COMPUTE_${arch}
+    "Enable code generation for tests for sm_${arch}"
+    ${this_option_init}
+  )
+
+  if (NOT THRUST_ENABLE_COMPUTE_${arch})
+    continue()
+  endif()
+
+  math(EXPR num_archs_enabled "${num_archs_enabled} + 1")
+
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    if (NOT ${num_archs_enabled} EQUAL 1)
+      message(FATAL_ERROR
+        "NVCXX does not support compilation for multiple device architectures "
+        "at once."
+      )
+    endif()
+    set(arch_flag "-gpu=cc${arch}")
+  elseif ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set(arch_flag "--cuda-gpu-arch=sm_${arch}")
+  else()
+    set(arch_flag "-gencode arch=compute_${arch},code=sm_${arch}")
+  endif()
+
+  string(APPEND compute_message " sm_${arch}")
+  string(APPEND THRUST_CUDA_FLAGS_NO_RDC " ${arch_flag}")
+  if (NOT arch IN_LIST no_rdc_archs)
+    string(APPEND THRUST_CUDA_FLAGS_RDC " ${arch_flag}")
+  endif()
+endforeach()
+
+if (NOT "NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  option(THRUST_ENABLE_COMPUTE_FUTURE
+    "Enable code generation for tests for compute_${highest_arch}"
+    ${option_init}
+  )
+  if (THRUST_ENABLE_COMPUTE_FUTURE)
+    string(APPEND THRUST_CUDA_FLAGS_BASE
+      " -gencode arch=compute_${highest_arch},code=compute_${highest_arch}"
+    )
+    string(APPEND compute_message " compute_${highest_arch}")
+  endif()
+endif()
+
+message(STATUS "Thrust: Explicitly enabled compute architectures:${compute_message}")
+
+# RDC is off by default in NVCC and on by default in NVC++. Turning off RDC
+# isn't currently supported by NVC++. So, we default to RDC off for NVCC and
+# RDC on for NVC++.
+set(option_init OFF)
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(option_init ON)
+endif()
+
+option(THRUST_ENABLE_TESTS_WITH_RDC
+  "Build all Thrust tests with RDC; tests that require RDC are not affected by this option."
+  ${option_init}
+)
+
+option(THRUST_ENABLE_EXAMPLES_WITH_RDC
+  "Build all Thrust examples with RDC; examples which require RDC are not affected by this option."
+  ${option_init}
+)
+
+# Check for RDC/SM compatibility and error/warn if necessary
+foreach (sm IN LISTS no_rdc_archs)
+  set(sm_opt THRUST_ENABLE_COMPUTE_${sm})
+  if (${sm_opt})
+    foreach (opt IN ITEMS TESTS EXAMPLES)
+      set(rdc_opt THRUST_ENABLE_${opt}_WITH_RDC)
+      if (${rdc_opt})
+        message(FATAL_ERROR
+          "${rdc_opt} is incompatible with ${sm_opt}, since sm_${sm} does not "
+          "support RDC."
+        )
+      endif()
+    endforeach()
+
+    message(NOTICE
+      "sm_${sm} does not support RDC. Targets that require RDC will be built "
+      "without support for this architecture."
+    )
+  endif()
+endforeach()
+
+
+# 
+# Clang CUDA options 
+#
+if ("Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(THRUST_CUDA_FLAGS_BASE "${THRUST_CUDA_FLAGS_BASE} -Wno-unknown-cuda-version -Xclang=-fcuda-allow-variadic-functions")
+endif()
+
+
+# By default RDC is not used:
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
diff --git a/cmake/ThrustFindThrust.cmake b/cmake/ThrustFindThrust.cmake
new file mode 100644
index 000000000..39a79e4b7
--- /dev/null
+++ b/cmake/ThrustFindThrust.cmake
@@ -0,0 +1,42 @@
+function(_thrust_find_thrust_multiconfig)
+  # Check which systems are enabled by multiconfig:
+  set(req_systems)
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+    list(APPEND req_systems CUDA)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP)
+    list(APPEND req_systems CPP)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB)
+    list(APPEND req_systems TBB)
+  endif()
+  if (THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP)
+    list(APPEND req_systems OMP)
+  endif()
+
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+    COMPONENTS ${req_systems}
+  )
+endfunction()
+
+function(_thrust_find_thrust_singleconfig)
+  find_package(Thrust REQUIRED CONFIG
+    NO_DEFAULT_PATH # Only check the explicit path in HINTS:
+    HINTS "${Thrust_SOURCE_DIR}"
+  )
+  # Create target now to prepare system found flags:
+  thrust_create_target(thrust FROM_OPTIONS ${THRUST_TARGET_FLAGS})
+  thrust_debug_target(thrust "${THRUST_VERSION}")
+endfunction()
+
+# Build a ${THRUST_TARGETS} list containing target names for all
+# requested configurations
+function(thrust_find_thrust)
+  if (THRUST_ENABLE_MULTICONFIG)
+    _thrust_find_thrust_multiconfig()
+  else()
+    _thrust_find_thrust_singleconfig()
+  endif()
+endfunction()
diff --git a/cmake/ThrustHeaderTesting.cmake b/cmake/ThrustHeaderTesting.cmake
new file mode 100644
index 000000000..3b3e00ca8
--- /dev/null
+++ b/cmake/ThrustHeaderTesting.cmake
@@ -0,0 +1,140 @@
+# For every public header, build a translation unit containing `#include <header>`
+# to let the compiler try to figure out warnings in that header if it is not otherwise
+# included in tests, and also to verify if the headers are modular enough.
+# .inl files are not globbed for, because they are not supposed to be used as public
+# entrypoints.
+
+# Meta target for all configs' header builds:
+add_custom_target(thrust.all.headers)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_systems ${config_host} ${config_device})
+
+  string(TOLOWER "${config_host}" host_lower)
+  string(TOLOWER "${config_device}" device_lower)
+
+  # GLOB ALL THE THINGS
+  set(headers_globs thrust/*.h)
+  set(headers_exclude_systems_globs thrust/system/*/*)
+  set(headers_systems_globs
+    thrust/system/${host_lower}/*
+    thrust/system/${device_lower}/*
+  )
+  set(headers_exclude_details_globs
+    thrust/detail/*
+    thrust/*/detail/*
+    thrust/*/*/detail/*
+  )
+
+  # Get all .h files...
+  file(GLOB_RECURSE headers
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_globs}
+  )
+
+  # ...then remove all system specific headers...
+  file(GLOB_RECURSE headers_exclude_systems
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_systems_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_systems})
+
+  # ...then add all headers specific to the selected host and device systems back again...
+  file(GLOB_RECURSE headers_systems
+    RELATIVE ${Thrust_SOURCE_DIR}/thrust
+    CONFIGURE_DEPENDS
+    ${headers_systems_globs}
+  )
+  list(APPEND headers ${headers_systems})
+
+  # ...and remove all the detail headers (also removing the detail headers from the selected systems).
+  file(GLOB_RECURSE headers_exclude_details
+    RELATIVE "${Thrust_SOURCE_DIR}/thrust"
+    CONFIGURE_DEPENDS
+    ${headers_exclude_details_globs}
+  )
+  list(REMOVE_ITEM headers ${headers_exclude_details})
+
+  # List of headers that aren't implemented for all backends, but are implemented for CUDA.
+  set(partially_implemented_CUDA
+    async/copy.h
+    async/for_each.h
+    async/reduce.h
+    async/scan.h
+    async/sort.h
+    async/transform.h
+    event.h
+    future.h
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for CPP.
+  set(partially_implemented_CPP
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for TBB.
+  set(partially_implemented_TBB
+  )
+
+  # List of headers that aren't implemented for all backends, but are implemented for OMP.
+  set(partially_implemented_OMP
+  )
+
+  # List of all partially implemented headers.
+  set(partially_implemented
+    ${partially_implemented_CUDA}
+    ${partially_implemented_CPP}
+    ${partially_implemented_TBB}
+    ${partially_implemented_OMP}
+  )
+  list(REMOVE_DUPLICATES partially_implemented)
+
+  set(headertest_srcs)
+
+  foreach (header IN LISTS headers)
+    if ("${header}" IN_LIST partially_implemented)
+      # This header is partially implemented on _some_ backends...
+      if (NOT "${header}" IN_LIST partially_implemented_${config_device})
+        # ...but not on the selected one.
+        continue()
+      endif()
+    endif()
+
+    set(headertest_src_ext .cpp)
+    if ("CUDA" STREQUAL "${config_device}")
+      set(headertest_src_ext .cu)
+    endif()
+
+    set(headertest_src "headers/${config_prefix}/${header}${headertest_src_ext}")
+    configure_file("${Thrust_SOURCE_DIR}/cmake/header_test.in" "${headertest_src}")
+
+    list(APPEND headertest_srcs "${headertest_src}")
+  endforeach()
+
+  set(headertest_target ${config_prefix}.headers)
+  add_library(${headertest_target} OBJECT ${headertest_srcs})
+  target_link_libraries(${headertest_target} PUBLIC ${thrust_target})
+  # Wrap Thrust/CUB in a custom namespace to check proper use of ns macros:
+  target_compile_definitions(${headertest_target} PRIVATE
+    "THRUST_WRAPPED_NAMESPACE=wrapped_thrust"
+    "CUB_WRAPPED_NAMESPACE=wrapped_cub"
+  )
+  thrust_clone_target_properties(${headertest_target} ${thrust_target})
+
+  # Disable macro checks on TBB; the TBB atomic implementation uses `I` and
+  # our checks will issue false errors.
+  if ("TBB" IN_LIST config_systems)
+    target_compile_definitions(${headertest_target}
+      PRIVATE THRUST_IGNORE_MACRO_CHECKS
+    )
+  endif()
+
+  thrust_fix_clang_nvcc_build_for(${headertest_target})
+
+  add_dependencies(thrust.all.headers ${headertest_target})
+  add_dependencies(${config_prefix}.all ${headertest_target})
+endforeach()
diff --git a/cmake/ThrustInstallRules.cmake b/cmake/ThrustInstallRules.cmake
new file mode 100644
index 000000000..98e72e196
--- /dev/null
+++ b/cmake/ThrustInstallRules.cmake
@@ -0,0 +1,58 @@
+# Bring in CMAKE_INSTALL_LIBDIR
+include(GNUInstallDirs)
+
+# Thrust is a header library; no need to build anything before installing:
+set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY TRUE)
+
+install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust"
+  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+  FILES_MATCHING
+    PATTERN "*.h"
+    PATTERN "*.inl"
+)
+
+install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake/"
+  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/thrust"
+  PATTERN *.cmake.in EXCLUDE
+)
+# Need to configure a file to store the infix specified in
+# CMAKE_INSTALL_INCLUDEDIR since it can be defined by the user
+set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/thrust")
+configure_file("${Thrust_SOURCE_DIR}/thrust/cmake/thrust-header-search.cmake.in"
+  "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
+  @ONLY)
+install(FILES "${Thrust_BINARY_DIR}/thrust/cmake/thrust-header-search.cmake"
+  DESTINATION "${install_location}")
+
+# Depending on how Thrust is configured, libcudacxx and CUB's CMake scripts may
+# or may not be include()'d, so force include their install rules when requested.
+# By default, these projects are installed alongside Thrust. This is controlled by
+# THRUST_INSTALL_CUB_HEADERS and THRUST_INSTALL_LIBCUDACXX_HEADERS.
+option(THRUST_INSTALL_CUB_HEADERS "Include CUB headers when installing." ON)
+if (THRUST_INSTALL_CUB_HEADERS)
+  # Use a function to limit scope of the CUB_*_DIR vars:
+  function(_thrust_install_cub_headers)
+    # Fake these for the logic in CUBInstallRules.cmake:
+    set(CUB_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/cub/")
+    set(CUB_BINARY_DIR "${Thrust_BINARY_DIR}/cub-config/")
+    set(CUB_ENABLE_INSTALL_RULES ON)
+    set(CUB_IN_THRUST OFF)
+    include("${Thrust_SOURCE_DIR}/dependencies/cub/cmake/CubInstallRules.cmake")
+  endfunction()
+
+  _thrust_install_cub_headers()
+endif()
+
+option(THRUST_INSTALL_LIBCUDACXX_HEADERS "Include libcudacxx headers when installing." ON)
+if (THRUST_INSTALL_LIBCUDACXX_HEADERS)
+  # Use a function to limit scope of the libcudacxx_*_DIR vars:
+  function(_thrust_install_libcudacxx_headers)
+    # Fake these for the logic in libcudacxxInstallRules.cmake:
+    set(libcudacxx_SOURCE_DIR "${Thrust_SOURCE_DIR}/dependencies/libcudacxx/")
+    set(libcudacxx_BINARY_DIR "${Thrust_BINARY_DIR}/libcudacxx-config/")
+    set(libcudacxx_ENABLE_INSTALL_RULES ON)
+    include("${Thrust_SOURCE_DIR}/dependencies/libcudacxx/cmake/libcudacxxInstallRules.cmake")
+  endfunction()
+
+  _thrust_install_libcudacxx_headers()
+endif()
diff --git a/cmake/ThrustMultiConfig.cmake b/cmake/ThrustMultiConfig.cmake
new file mode 100644
index 000000000..aa9fc0226
--- /dev/null
+++ b/cmake/ThrustMultiConfig.cmake
@@ -0,0 +1,129 @@
+# This file defines thrust_configure_multiconfig(), which sets up and handles
+# the MultiConfig options that allow multiple host/device/dialect configurations
+# to be generated from a single thrust build.
+
+function(thrust_configure_multiconfig)
+  option(THRUST_ENABLE_MULTICONFIG "Enable multiconfig options for coverage testing." OFF)
+
+  # Dialects:
+  set(THRUST_CPP_DIALECT_OPTIONS
+    11 14 17 20
+    CACHE INTERNAL "C++ dialects supported by Thrust." FORCE
+  )
+
+  if (THRUST_ENABLE_MULTICONFIG)
+    # Handle dialect options:
+    foreach (dialect IN LISTS THRUST_CPP_DIALECT_OPTIONS)
+      set(default_value OFF)
+      if (dialect EQUAL 14) # Default to just 14 on:
+        set(default_value ON)
+      endif()
+      option(THRUST_MULTICONFIG_ENABLE_DIALECT_CPP${dialect}
+        "Generate C++${dialect} build configurations."
+        ${default_value}
+      )
+    endforeach()
+
+    # Option to enable all standards supported by the CUDA and CXX compilers:
+    option(THRUST_MULTICONFIG_ENABLE_DIALECT_ALL
+      "Generate build configurations for all C++ standards supported by the configured compilers."
+      OFF
+    )
+
+    # Option to enable only the most recent supported dialect:
+    option(THRUST_MULTICONFIG_ENABLE_DIALECT_LATEST
+      "Generate a single build configuration for the most recent C++ standard supported by the configured compilers."
+      OFF
+    )
+
+    # Systems:
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CPP "Generate build configurations that use CPP." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA "Generate build configurations that use CUDA." ON)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_OMP "Generate build configurations that use OpenMP." OFF)
+    option(THRUST_MULTICONFIG_ENABLE_SYSTEM_TBB "Generate build configurations that use TBB." OFF)
+
+    # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3:
+    if (THRUST_MULTICONFIG_ENABLE_DIALECT_CPP17 AND
+        THRUST_MULTICONFIG_ENABLE_SYSTEM_CUDA)
+      cmake_minimum_required(VERSION 3.18.3)
+    endif()
+
+    # Workload:
+    # - `SMALL`: [3 configs] Minimal coverage and validation of each device system against the `CPP` host.
+    # - `MEDIUM`: [6 configs] Cheap extended coverage.
+    # - `LARGE`: [8 configs] Expensive extended coverage. Include all useful build configurations.
+    # - `FULL`: [12 configs] The complete cross product of all possible build configurations.
+    #
+    # Config   | Workloads | Value      | Expense   | Note
+    # ---------|-----------|------------|-----------|-----------------------------
+    # CPP/CUDA | F L M S   | Essential  | Expensive | Validates CUDA against CPP
+    # CPP/OMP  | F L M S   | Essential  | Cheap     | Validates OMP against CPP
+    # CPP/TBB  | F L M S   | Essential  | Cheap     | Validates TBB against CPP
+    # CPP/CPP  | F L M     | Important  | Cheap     | Tests CPP as device
+    # OMP/OMP  | F L M     | Important  | Cheap     | Tests OMP as host
+    # TBB/TBB  | F L M     | Important  | Cheap     | Tests TBB as host
+    # TBB/CUDA | F L       | Important  | Expensive | Validates TBB/CUDA interop
+    # OMP/CUDA | F L       | Important  | Expensive | Validates OMP/CUDA interop
+    # TBB/OMP  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # OMP/TBB  | F         | Not useful | Cheap     | Mixes CPU-parallel systems
+    # TBB/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+    # OMP/CPP  | F         | Not Useful | Cheap     | Parallel host, serial device
+
+    set(THRUST_MULTICONFIG_WORKLOAD SMALL CACHE STRING
+      "Limit host/device configs: SMALL (up to 3 h/d combos per dialect), MEDIUM(6), LARGE(8), FULL(12)"
+    )
+    set_property(CACHE THRUST_MULTICONFIG_WORKLOAD PROPERTY STRINGS
+      SMALL MEDIUM LARGE FULL
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS
+      CPP_OMP CPP_TBB CPP_CUDA
+      CACHE INTERNAL "Host/device combos enabled for SMALL workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_SMALL_CONFIGS}
+      CPP_CPP TBB_TBB OMP_OMP
+      CACHE INTERNAL "Host/device combos enabled for MEDIUM workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_MEDIUM_CONFIGS}
+      OMP_CUDA TBB_CUDA
+      CACHE INTERNAL "Host/device combos enabled for LARGE workloads." FORCE
+    )
+    set(THRUST_MULTICONFIG_WORKLOAD_FULL_CONFIGS
+      ${THRUST_MULTICONFIG_WORKLOAD_LARGE_CONFIGS}
+      OMP_CPP TBB_CPP OMP_TBB TBB_OMP
+      CACHE INTERNAL "Host/device combos enabled for FULL workloads." FORCE
+    )
+
+    # Hide the single config options if they exist from a previous run:
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE INTERNAL)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE INTERNAL)
+    endif()
+    if (DEFINED THRUST_CPP_DIALECT)
+      set_property(CACHE THRUST_CPP_DIALECT PROPERTY TYPE INTERNAL)
+    endif()
+
+  else() # Single config:
+    # Restore system option visibility if these cache options already exist
+    # from a previous run.
+    if (DEFINED THRUST_HOST_SYSTEM)
+      set_property(CACHE THRUST_HOST_SYSTEM PROPERTY TYPE STRING)
+      set_property(CACHE THRUST_DEVICE_SYSTEM PROPERTY TYPE STRING)
+    endif()
+
+    set(THRUST_CPP_DIALECT 14
+      CACHE STRING "The C++ standard to target: ${THRUST_CPP_DIALECT_OPTIONS}"
+    )
+    set_property(CACHE THRUST_CPP_DIALECT
+      PROPERTY STRINGS
+      ${THRUST_CPP_DIALECT_OPTIONS}
+    )
+
+    # CMake fixed C++17 support for NVCC + MSVC targets in 3.18.3:
+    if (THRUST_CPP_DIALECT EQUAL 17 AND
+        THRUST_DEVICE_SYSTEM STREQUAL "CUDA")
+      cmake_minimum_required(VERSION 3.18.3)
+    endif()
+  endif()
+endfunction()
diff --git a/cmake/ThrustRunExample.cmake b/cmake/ThrustRunExample.cmake
new file mode 100644
index 000000000..24e9dd2bb
--- /dev/null
+++ b/cmake/ThrustRunExample.cmake
@@ -0,0 +1,49 @@
+# Inputs:
+#
+# Variable             | Type     | Doc
+# ---------------------|----------|--------------------------------------
+# EXAMPLE_EXECUTABLE   | FilePath | Path to example executable
+# FILECHECK_ENABLED    | Boolean  | Run FileCheck comparison test
+# FILECHECK_EXECUTABLE | FilePath | Path to the LLVM FileCheck utility
+# REFERENCE_FILE       | FilePath | Path to the FileCheck reference file
+
+if (FILECHECK_ENABLED)
+  if (NOT EXISTS "${REFERENCE_FILE}")
+    message(FATAL_ERROR
+      "FileCheck requested for '${EXAMPLE_EXECUTABLE}', but reference file "
+      "does not exist at '${REFERENCE_FILE}`."
+    )
+  endif()
+
+  # If the reference file is empty, validate that the example doesn't
+  # produce any output.
+  file(SIZE "${REFERENCE_FILE}" file_size)
+  message("${REFERENCE_FILE}: ${file_size} bytes")
+
+  if (file_size EQUAL 0)
+    set(check_empty_output TRUE)
+    set(filecheck_command)
+  else()
+    set(check_empty_output FALSE)
+    set(filecheck_command COMMAND "${FILECHECK_EXECUTABLE}" "${REFERENCE_FILE}")
+  endif()
+endif()
+
+execute_process(
+  COMMAND "${EXAMPLE_EXECUTABLE}"
+  ${filecheck_command}
+  RESULT_VARIABLE exit_code
+  OUTPUT_VARIABLE stdout
+  ERROR_VARIABLE stderr
+)
+
+if (NOT 0 EQUAL exit_code)
+  message(FATAL_ERROR "${EXAMPLE_EXECUTABLE} failed (${exit_code}):\n${stderr}")
+endif()
+
+if (check_empty_output)
+  string(LENGTH "${stdout}" stdout_size)
+  if (NOT stdout_size EQUAL 0)
+    message(FATAL_ERROR "${EXAMPLE_EXECUTABLE}: output received, but not expected:\n${stdout}")
+  endif()
+endif()
diff --git a/cmake/ThrustRunTest.cmake b/cmake/ThrustRunTest.cmake
new file mode 100644
index 000000000..0d03129f0
--- /dev/null
+++ b/cmake/ThrustRunTest.cmake
@@ -0,0 +1,8 @@
+execute_process(
+  COMMAND "${THRUST_BINARY}"
+  RESULT_VARIABLE EXIT_CODE
+)
+
+if (NOT "0" STREQUAL "${EXIT_CODE}")
+    message(FATAL_ERROR "${THRUST_BINARY} failed (${EXIT_CODE})")
+endif ()
diff --git a/cmake/ThrustUtilities.cmake b/cmake/ThrustUtilities.cmake
new file mode 100644
index 000000000..6bbb1200a
--- /dev/null
+++ b/cmake/ThrustUtilities.cmake
@@ -0,0 +1,25 @@
+# Given a cu_file (e.g. foo/bar.cu) relative to CMAKE_CURRENT_SOURCE_DIR
+# and a thrust_target, create a cpp file that includes the .cu file, and set
+# ${cpp_file_var} in the parent scope to the full path of the new file. The new
+# file will be generated in:
+# ${CMAKE_CURRENT_BINARY_DIR}/<thrust_target_prefix>/${cu_file}.cpp
+function(thrust_wrap_cu_in_cpp cpp_file_var cu_file thrust_target)
+  thrust_get_target_property(prefix ${thrust_target} PREFIX)
+  set(wrapped_source_file "${CMAKE_CURRENT_SOURCE_DIR}/${cu_file}")
+  set(cpp_file "${CMAKE_CURRENT_BINARY_DIR}/${prefix}/${cu_file}.cpp")
+  configure_file("${Thrust_SOURCE_DIR}/cmake/wrap_source_file.cpp.in" "${cpp_file}")
+  set(${cpp_file_var} "${cpp_file}" PARENT_SCOPE)
+endfunction()
+
+# Enable RDC for a CUDA target. Encapsulates compiler hacks:
+function(thrust_enable_rdc_for_cuda_target target_name)
+  if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    set_target_properties(${target_name} PROPERTIES
+      COMPILE_FLAGS "-gpu=rdc"
+    )
+  else()
+    set_target_properties(${target_name} PROPERTIES
+      CUDA_SEPARABLE_COMPILATION ON
+    )
+  endif()
+endfunction()
diff --git a/cmake/detect_compute_archs.cu b/cmake/detect_compute_archs.cu
new file mode 100644
index 000000000..1d30dca4b
--- /dev/null
+++ b/cmake/detect_compute_archs.cu
@@ -0,0 +1,43 @@
+/*
+ *  Copyright 2019-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <cstdio>
+#include <set>
+#include <string>
+
+int main(int argc, char** argv) {
+  std::set<std::string> archs;
+  int devices;
+  if ((cudaGetDeviceCount(&devices) == cudaSuccess) && (devices > 0)) {
+    for (int dev = 0; dev < devices; ++dev) {
+      char buff[32];
+      cudaDeviceProp prop;
+      if(cudaGetDeviceProperties(&prop, dev) != cudaSuccess) continue;
+      sprintf(buff, "%d%d", prop.major, prop.minor);
+      archs.insert(buff);
+    }
+  }
+  if (archs.empty()) {
+    printf("NONE");
+  } else {
+    bool first = true;
+    for(const auto& arch : archs) {
+      printf(first ? "%s" : ";%s", arch.c_str());
+      first = false;
+    }
+  }
+  printf("\n");
+}
diff --git a/cmake/filecheck_smoke_test b/cmake/filecheck_smoke_test
new file mode 100644
index 000000000..aad1b0fd1
--- /dev/null
+++ b/cmake/filecheck_smoke_test
@@ -0,0 +1 @@
+SMOKE
diff --git a/cmake/header_test.in b/cmake/header_test.in
new file mode 100644
index 000000000..250dd5170
--- /dev/null
+++ b/cmake/header_test.in
@@ -0,0 +1,61 @@
+// This source file checks that:
+// 1) Header <thrust/${header}> compiles without error.
+// 2) Common macro collisions with platform/system headers are avoided.
+
+// Turn off failures for certain configurations:
+#define THRUST_CPP11_REQUIRED_NO_ERROR
+#define THRUST_CPP14_REQUIRED_NO_ERROR
+#define THRUST_MODERN_GCC_REQUIRED_NO_ERROR
+
+#ifndef THRUST_IGNORE_MACRO_CHECKS
+
+// Define THRUST_MACRO_CHECK(macro, header), which emits a diagnostic indicating
+// a potential macro collision and halts.
+//
+// Hacky way to build a string, but it works on all tested platforms.
+#define THRUST_MACRO_CHECK(MACRO, HEADER)                                      \
+  THRUST_MACRO_CHECK_IMPL(Identifier MACRO should not be used from Thrust      \
+                          headers due to conflicts with HEADER macros.)
+
+// Use raw platform checks instead of the THRUST_HOST_COMPILER macros since we
+// don't want to #include any headers other than the one being tested.
+//
+// This is only implemented for MSVC/GCC/Clang.
+#if defined(_MSC_VER) // MSVC
+
+// Fake up an error for MSVC
+#define THRUST_MACRO_CHECK_IMPL(msg)                                           \
+  /* Print message that looks like an error: */                                \
+  __pragma(message(__FILE__ ":" THRUST_MACRO_CHECK_IMPL0(__LINE__)             \
+                   ": error: " #msg))                                          \
+  /* abort compilation due to static_assert or syntax error: */                \
+  static_assert(false, #msg);
+#define THRUST_MACRO_CHECK_IMPL0(x) THRUST_MACRO_CHECK_IMPL1(x)
+#define THRUST_MACRO_CHECK_IMPL1(x) #x
+
+#elif defined(__clang__) || defined(__GNUC__)
+
+// GCC/clang are easy:
+#define THRUST_MACRO_CHECK_IMPL(msg) THRUST_MACRO_CHECK_IMPL0(GCC error #msg)
+#define THRUST_MACRO_CHECK_IMPL0(expr) _Pragma(#expr)
+
+#endif
+
+// complex.h conflicts
+#define I THRUST_MACRO_CHECK('I', complex.h)
+
+// windows.h conflicts
+#define small THRUST_MACRO_CHECK('small', windows.h)
+// We can't enable these checks without breaking some builds -- some standard
+// library implementations unconditionally `#undef` these macros, which then
+// causes random failures later.
+// Leaving these commented out as a warning: Here be dragons.
+//#define min(...) THRUST_MACRO_CHECK('min', windows.h)
+//#define max(...) THRUST_MACRO_CHECK('max', windows.h)
+
+// termios.h conflicts (NVIDIA/thrust#1547)
+#define B0 THRUST_MACRO_CHECK("B0", termios.h)
+
+#endif // THRUST_IGNORE_MACRO_CHECKS
+
+#include <thrust/${header}>
diff --git a/cmake/wrap_source_file.cpp.in b/cmake/wrap_source_file.cpp.in
new file mode 100644
index 000000000..3015238cc
--- /dev/null
+++ b/cmake/wrap_source_file.cpp.in
@@ -0,0 +1 @@
+#include <${wrapped_source_file}>
diff --git a/dependencies/cub b/dependencies/cub
new file mode 160000
index 000000000..b2e8bccb8
--- /dev/null
+++ b/dependencies/cub
@@ -0,0 +1 @@
+Subproject commit b2e8bccb8c0cd15279974fe4b9b8d6fcd1842b57
diff --git a/dependencies/libcudacxx b/dependencies/libcudacxx
new file mode 160000
index 000000000..55dd2c993
--- /dev/null
+++ b/dependencies/libcudacxx
@@ -0,0 +1 @@
+Subproject commit 55dd2c99346baa3a14949a0f7e9c41865e434eda
diff --git a/doc/thrust.dox b/doc/thrust.dox
deleted file mode 100644
index f1dc884f8..000000000
--- a/doc/thrust.dox
+++ /dev/null
@@ -1,1078 +0,0 @@
-# Doxyfile 1.3.4
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project
-#
-# All text after a hash (#) is considered a comment and will be ignored
-# The format is:
-#       TAG = value [value, ...]
-# For lists items can also be appended using:
-#       TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ")
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# The PROJECT_NAME tag is a single word (or a sequence of words surrounded 
-# by quotes) that should identify the project.
-
-PROJECT_NAME           = thrust
-
-# The PROJECT_NUMBER tag can be used to enter a project or revision number. 
-# This could be handy for archiving the generated documentation or 
-# if some version control system is used.
-
-PROJECT_NUMBER         = 
-
-# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) 
-# base path where the generated documentation will be put. 
-# If a relative path is entered, it will be relative to the location 
-# where doxygen was started. If left blank the current directory will be used.
-
-OUTPUT_DIRECTORY       = targets/doc
-
-# The OUTPUT_LANGUAGE tag is used to specify the language in which all 
-# documentation generated by doxygen is written. Doxygen will use this 
-# information to generate all constant output in the proper language. 
-# The default language is English, other supported languages are: 
-# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, 
-# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en 
-# (Japanese with English messages), Korean, Norwegian, Polish, Portuguese, 
-# Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish, and Ukrainian.
-
-OUTPUT_LANGUAGE        = English
-
-# This tag can be used to specify the encoding used in the generated output. 
-# The encoding is not always determined by the language that is chosen, 
-# but also whether or not the output is meant for Windows or non-Windows users. 
-# In case there is a difference, setting the USE_WINDOWS_ENCODING tag to YES 
-# forces the Windows encoding (this is the default for the Windows binary), 
-# whereas setting the tag to NO uses a Unix-style encoding (the default for 
-# all platforms other than Windows).
-
-USE_WINDOWS_ENCODING   = NO
-
-# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will 
-# include brief member descriptions after the members that are listed in 
-# the file and class documentation (similar to JavaDoc). 
-# Set to NO to disable this.
-
-BRIEF_MEMBER_DESC      = YES
-
-# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend 
-# the brief description of a member or function before the detailed description. 
-# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the 
-# brief descriptions will be completely suppressed.
-
-REPEAT_BRIEF           = YES
-
-# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then 
-# Doxygen will generate a detailed section even if there is only a brief 
-# description.
-
-ALWAYS_DETAILED_SEC    = NO
-
-# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all inherited 
-# members of a class in the documentation of that class as if those members were 
-# ordinary class members. Constructors, destructors and assignment operators of 
-# the base classes will not be shown.
-
-INLINE_INHERITED_MEMB  = NO
-
-# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full 
-# path before files name in the file list and in the header files. If set 
-# to NO the shortest path that makes the file name unique will be used.
-
-FULL_PATH_NAMES        = YES
-
-# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag 
-# can be used to strip a user-defined part of the path. Stripping is 
-# only done if one of the specified strings matches the left-hand part of 
-# the path. It is allowed to use relative paths in the argument list.
-
-STRIP_FROM_PATH        = 
-
-# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter 
-# (but less readable) file names. This can be useful is your file systems 
-# doesn't support long names like on DOS, Mac, or CD-ROM.
-
-SHORT_NAMES            = NO
-
-# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen 
-# will interpret the first line (until the first dot) of a JavaDoc-style 
-# comment as the brief description. If set to NO, the JavaDoc 
-# comments will behave just like the Qt-style comments (thus requiring an 
-# explict @brief command for a brief description.
-
-JAVADOC_AUTOBRIEF      = NO
-
-# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen 
-# treat a multi-line C++ special comment block (i.e. a block of //! or /// 
-# comments) as a brief description. This used to be the default behaviour. 
-# The new default is to treat a multi-line C++ comment block as a detailed 
-# description. Set this tag to YES if you prefer the old behaviour instead.
-
-MULTILINE_CPP_IS_BRIEF = NO
-
-# If the DETAILS_AT_TOP tag is set to YES then Doxygen 
-# will output the detailed description near the top, like JavaDoc.
-# If set to NO, the detailed description appears after the member 
-# documentation.
-
-DETAILS_AT_TOP         = NO
-
-# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented 
-# member inherits the documentation from any documented member that it 
-# reimplements.
-
-INHERIT_DOCS           = YES
-
-# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC 
-# tag is set to YES, then doxygen will reuse the documentation of the first 
-# member in the group (if any) for the other members of the group. By default 
-# all members of a group must be documented explicitly.
-
-DISTRIBUTE_GROUP_DOC   = NO
-
-# The TAB_SIZE tag can be used to set the number of spaces in a tab. 
-# Doxygen uses this value to replace tabs by spaces in code fragments.
-
-TAB_SIZE               = 8
-
-# This tag can be used to specify a number of aliases that acts 
-# as commands in the documentation. An alias has the form "name=value". 
-# For example adding "sideeffect=\par Side Effects:\n" will allow you to 
-# put the command \sideeffect (or @sideeffect) in the documentation, which 
-# will result in a user-defined paragraph with heading "Side Effects:". 
-# You can put \n's in the value part of an alias to insert newlines.
-
-ALIASES                = 
-
-# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources 
-# only. Doxygen will then generate output that is more tailored for C. 
-# For instance, some of the names that are used will be different. The list 
-# of all members will be omitted, etc.
-
-OPTIMIZE_OUTPUT_FOR_C  = NO
-
-# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java sources 
-# only. Doxygen will then generate output that is more tailored for Java. 
-# For instance, namespaces will be presented as packages, qualified scopes 
-# will look different, etc.
-
-OPTIMIZE_OUTPUT_JAVA   = NO
-
-# Set the SUBGROUPING tag to YES (the default) to allow class member groups of 
-# the same type (for instance a group of public functions) to be put as a 
-# subgroup of that type (e.g. under the Public Functions section). Set it to 
-# NO to prevent subgrouping. Alternatively, this can be done per class using 
-# the \nosubgrouping command.
-
-SUBGROUPING            = YES
-
-#---------------------------------------------------------------------------
-# Build related configuration options
-#---------------------------------------------------------------------------
-
-# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in 
-# documentation are documented, even if no documentation was available. 
-# Private class members and static file members will be hidden unless 
-# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
-
-EXTRACT_ALL            = NO
-
-# If the EXTRACT_PRIVATE tag is set to YES all private members of a class 
-# will be included in the documentation.
-
-EXTRACT_PRIVATE        = NO
-
-# If the EXTRACT_STATIC tag is set to YES all static members of a file 
-# will be included in the documentation.
-
-EXTRACT_STATIC         = YES
-
-# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) 
-# defined locally in source files will be included in the documentation. 
-# If set to NO only classes defined in header files are included.
-
-EXTRACT_LOCAL_CLASSES  = YES
-
-# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all 
-# undocumented members of documented classes, files or namespaces. 
-# If set to NO (the default) these members will be included in the 
-# various overviews, but no documentation section is generated. 
-# This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_MEMBERS     = NO
-
-# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all 
-# undocumented classes that are normally visible in the class hierarchy. 
-# If set to NO (the default) these classes will be included in the various 
-# overviews. This option has no effect if EXTRACT_ALL is enabled.
-
-HIDE_UNDOC_CLASSES     = NO
-
-# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all 
-# friend (class|struct|union) declarations. 
-# If set to NO (the default) these declarations will be included in the 
-# documentation.
-
-HIDE_FRIEND_COMPOUNDS  = NO
-
-# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any 
-# documentation blocks found inside the body of a function. 
-# If set to NO (the default) these blocks will be appended to the 
-# function's detailed documentation block.
-
-HIDE_IN_BODY_DOCS      = NO
-
-# The INTERNAL_DOCS tag determines if documentation 
-# that is typed after a \internal command is included. If the tag is set 
-# to NO (the default) then the documentation will be excluded. 
-# Set it to YES to include the internal documentation.
-
-INTERNAL_DOCS          = NO
-
-# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate 
-# file names in lower-case letters. If set to YES upper-case letters are also 
-# allowed. This is useful if you have classes or files whose names only differ 
-# in case and if your file system supports case sensitive file names. Windows 
-# users are advised to set this option to NO.
-
-CASE_SENSE_NAMES       = YES
-
-# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen 
-# will show members with their full class and namespace scopes in the 
-# documentation. If set to YES the scope will be hidden.
-
-HIDE_SCOPE_NAMES       = NO
-
-# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen 
-# will put a list of the files that are included by a file in the documentation 
-# of that file.
-
-SHOW_INCLUDE_FILES     = YES
-
-# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] 
-# is inserted in the documentation for inline members.
-
-INLINE_INFO            = YES
-
-# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen 
-# will sort the (detailed) documentation of file and class members 
-# alphabetically by member name. If set to NO the members will appear in 
-# declaration order.
-
-SORT_MEMBER_DOCS       = YES
-
-# The GENERATE_TODOLIST tag can be used to enable (YES) or 
-# disable (NO) the todo list. This list is created by putting \todo 
-# commands in the documentation.
-
-GENERATE_TODOLIST      = YES
-
-# The GENERATE_TESTLIST tag can be used to enable (YES) or 
-# disable (NO) the test list. This list is created by putting \test 
-# commands in the documentation.
-
-GENERATE_TESTLIST      = YES
-
-# The GENERATE_BUGLIST tag can be used to enable (YES) or 
-# disable (NO) the bug list. This list is created by putting \bug 
-# commands in the documentation.
-
-GENERATE_BUGLIST       = YES
-
-# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or 
-# disable (NO) the deprecated list. This list is created by putting 
-# \deprecated commands in the documentation.
-
-GENERATE_DEPRECATEDLIST= YES
-
-# The ENABLED_SECTIONS tag can be used to enable conditional 
-# documentation sections, marked by \if sectionname ... \endif.
-
-ENABLED_SECTIONS       = 
-
-# The MAX_INITIALIZER_LINES tag determines the maximum number of lines 
-# the initial value of a variable or define consists of for it to appear in 
-# the documentation. If the initializer consists of more lines than specified 
-# here it will be hidden. Use a value of 0 to hide initializers completely. 
-# The appearance of the initializer of individual variables and defines in the 
-# documentation can be controlled using \showinitializer or \hideinitializer 
-# command in the documentation regardless of this setting.
-
-MAX_INITIALIZER_LINES  = 30
-
-# Set the SHOW_USED_FILES tag to NO to disable the list of files generated 
-# at the bottom of the documentation of classes and structs. If set to YES the 
-# list will mention the files that were used to generate the documentation.
-
-SHOW_USED_FILES        = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated 
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET                  = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are 
-# generated by doxygen. Possible values are YES and NO. If left blank 
-# NO is used.
-
-WARNINGS               = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings 
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will 
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED   = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for 
-# potential errors in the documentation, such as not documenting some 
-# parameters in a documented function, or documenting parameters that 
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR      = YES
-
-# The WARN_FORMAT tag determines the format of the warning messages that 
-# doxygen can produce. The string should contain the $file, $line, and $text 
-# tags, which will be replaced by the file and line number from which the 
-# warning originated and the warning text.
-
-WARN_FORMAT            = "$file:$line: $text"
-
-# The WARN_LOGFILE tag can be used to specify a file to which warning 
-# and error messages should be written. If left blank the output is written 
-# to stderr.
-
-WARN_LOGFILE           = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the input files
-#---------------------------------------------------------------------------
-
-# The INPUT tag can be used to specify the files and/or directories that contain 
-# documented source files. You may enter file names like "myfile.cpp" or 
-# directories like "/usr/src/myproject". Separate the files or directories 
-# with spaces.
-
-INPUT                  = thrust examples
-
-# If the value of the INPUT tag contains directories, you can use the 
-# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
-# blank the following patterns are tested: 
-# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx *.hpp 
-# *.h++ *.idl *.odl *.cs *.php *.php3 *.inc
-
-FILE_PATTERNS          = 
-
-# The RECURSIVE tag can be used to turn specify whether or not subdirectories 
-# should be searched for input files as well. Possible values are YES and NO. 
-# If left blank NO is used.
-
-RECURSIVE              = YES
-
-# The EXCLUDE tag can be used to specify files and/or directories that should 
-# excluded from the INPUT source files. This way you can easily exclude a 
-# subdirectory from a directory tree whose root is specified with the INPUT tag.
-
-EXCLUDE                = examples
-
-# The EXCLUDE_SYMLINKS tag can be used select whether or not files or directories 
-# that are symbolic links (a Unix filesystem feature) are excluded from the input.
-
-EXCLUDE_SYMLINKS       = NO
-
-# If the value of the INPUT tag contains directories, you can use the 
-# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude 
-# certain files from those directories.
-
-EXCLUDE_PATTERNS       = */detail/*
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or 
-# directories that contain example code fragments that are included (see 
-# the \include command).
-
-EXAMPLE_PATH           = examples
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the 
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
-# and *.h) to filter out the source-files in the directories. If left 
-# blank all files are included.
-
-EXAMPLE_PATTERNS       = 
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be 
-# searched for input files to be used with the \include or \dontinclude 
-# commands irrespective of the value of the RECURSIVE tag. 
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE      = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or 
-# directories that contain image that are included in the documentation (see 
-# the \image command).
-
-IMAGE_PATH             = 
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should 
-# invoke to filter for each input file. Doxygen will invoke the filter program 
-# by executing (via popen()) the command <filter> <input-file>, where <filter> 
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an 
-# input file. Doxygen will then use the output that the filter program writes 
-# to standard output.
-
-INPUT_FILTER           = 
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using 
-# INPUT_FILTER) will be used to filter the input files when producing source 
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES    = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will 
-# be generated. Documented entities will be cross-referenced with these sources.
-
-SOURCE_BROWSER         = NO
-
-# Setting the INLINE_SOURCES tag to YES will include the body 
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES         = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct 
-# doxygen to hide any special comment blocks from generated source code 
-# fragments. Normal C and C++ comments will always remain visible.
-
-STRIP_CODE_COMMENTS    = YES
-
-# If the REFERENCED_BY_RELATION tag is set to YES (the default) 
-# then for each documented function all documented 
-# functions referencing it will be listed.
-
-REFERENCED_BY_RELATION = YES
-
-# If the REFERENCES_RELATION tag is set to YES (the default) 
-# then for each documented function all documented entities 
-# called/used by that function will be listed.
-
-REFERENCES_RELATION    = YES
-
-# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen 
-# will generate a verbatim copy of the header file for each class for 
-# which an include is specified. Set to NO to disable this.
-
-VERBATIM_HEADERS       = YES
-
-#---------------------------------------------------------------------------
-# configuration options related to the alphabetical class index
-#---------------------------------------------------------------------------
-
-# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index 
-# of all compounds will be generated. Enable this if the project 
-# contains a lot of classes, structs, unions or interfaces.
-
-ALPHABETICAL_INDEX     = NO
-
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then 
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns 
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX    = 5
-
-# In case all classes in a project start with a common prefix, all 
-# classes will be put under the same header in the alphabetical index. 
-# The IGNORE_PREFIX tag can be used to specify one or more prefixes that 
-# should be ignored while generating the index headers.
-
-IGNORE_PREFIX          = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the HTML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_HTML tag is set to YES (the default) Doxygen will 
-# generate HTML output.
-
-GENERATE_HTML          = YES
-
-# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `html' will be used as the default path.
-
-HTML_OUTPUT            = html
-
-# The HTML_FILE_EXTENSION tag can be used to specify the file extension for 
-# each generated HTML page (for example: .htm,.php,.asp). If it is left blank 
-# doxygen will generate files with .html extension.
-
-HTML_FILE_EXTENSION    = .html
-
-# The HTML_HEADER tag can be used to specify a personal HTML header for 
-# each generated HTML page. If it is left blank doxygen will generate a 
-# standard header.
-
-HTML_HEADER            = 
-
-# The HTML_FOOTER tag can be used to specify a personal HTML footer for 
-# each generated HTML page. If it is left blank doxygen will generate a 
-# standard footer.
-
-HTML_FOOTER            = 
-
-# The HTML_STYLESHEET tag can be used to specify a user-defined cascading 
-# style sheet that is used by each HTML page. It can be used to 
-# fine-tune the look of the HTML output. If the tag is left blank doxygen 
-# will generate a default style sheet
-
-HTML_STYLESHEET        = 
-
-# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes, 
-# files or namespaces will be aligned in HTML using tables. If set to 
-# NO a bullet list will be used.
-
-HTML_ALIGN_MEMBERS     = YES
-
-# If the GENERATE_HTMLHELP tag is set to YES, additional index files 
-# will be generated that can be used as input for tools like the 
-# Microsoft HTML help workshop to generate a compressed HTML help file (.chm) 
-# of the generated HTML documentation.
-
-GENERATE_HTMLHELP      = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can 
-# be used to specify the file name of the resulting .chm file. You 
-# can add a path in front of the file if the result should not be 
-# written to the html output dir.
-
-CHM_FILE               = 
-
-# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can 
-# be used to specify the location (absolute path including file name) of 
-# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run 
-# the HTML help compiler on the generated index.hhp.
-
-HHC_LOCATION           = 
-
-# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag 
-# controls if a separate .chi index file is generated (YES) or that 
-# it should be included in the master .chm file (NO).
-
-GENERATE_CHI           = NO
-
-# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag 
-# controls whether a binary table of contents is generated (YES) or a 
-# normal table of contents (NO) in the .chm file.
-
-BINARY_TOC             = NO
-
-# The TOC_EXPAND flag can be set to YES to add extra items for group members 
-# to the contents of the HTML help documentation and to the tree view.
-
-TOC_EXPAND             = NO
-
-# The DISABLE_INDEX tag can be used to turn on/off the condensed index at 
-# top of each HTML page. The value NO (the default) enables the index and 
-# the value YES disables it.
-
-DISABLE_INDEX          = NO
-
-# This tag can be used to set the number of enum values (range [1..20]) 
-# that doxygen will group on one line in the generated HTML documentation.
-
-ENUM_VALUES_PER_LINE   = 4
-
-# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
-# generated containing a tree-like index structure (just like the one that 
-# is generated for HTML Help). For this to work a browser that supports 
-# JavaScript, DHTML, CSS and frames is required (for instance Mozilla 1.0+, 
-# Netscape 6.0+, Internet explorer 5.0+, or Konqueror). Windows users are 
-# probably better off using the HTML help feature.
-
-GENERATE_TREEVIEW      = NO
-
-# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be 
-# used to set the initial width (in pixels) of the frame in which the tree 
-# is shown.
-
-TREEVIEW_WIDTH         = 250
-
-#---------------------------------------------------------------------------
-# configuration options related to the LaTeX output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will 
-# generate Latex output.
-
-GENERATE_LATEX         = NO
-
-# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `latex' will be used as the default path.
-
-LATEX_OUTPUT           = latex
-
-# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be 
-# invoked. If left blank `latex' will be used as the default command name.
-
-LATEX_CMD_NAME         = latex
-
-# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to 
-# generate index for LaTeX. If left blank `makeindex' will be used as the 
-# default command name.
-
-MAKEINDEX_CMD_NAME     = makeindex
-
-# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact 
-# LaTeX documents. This may be useful for small projects and may help to 
-# save some trees in general.
-
-COMPACT_LATEX          = NO
-
-# The PAPER_TYPE tag can be used to set the paper type that is used 
-# by the printer. Possible values are: a4, a4wide, letter, legal and 
-# executive. If left blank a4wide will be used.
-
-PAPER_TYPE             = a4wide
-
-# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX 
-# packages that should be included in the LaTeX output.
-
-EXTRA_PACKAGES         = 
-
-# The LATEX_HEADER tag can be used to specify a personal LaTeX header for 
-# the generated latex document. The header should contain everything until 
-# the first chapter. If it is left blank doxygen will generate a 
-# standard header. Notice: only use this tag if you know what you are doing!
-
-LATEX_HEADER           = 
-
-# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated 
-# is prepared for conversion to pdf (using ps2pdf). The pdf file will 
-# contain links (just like the HTML output) instead of page references 
-# This makes the output suitable for online browsing using a pdf viewer.
-
-PDF_HYPERLINKS         = NO
-
-# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of 
-# plain latex in the generated Makefile. Set this option to YES to get a 
-# higher quality PDF documentation.
-
-USE_PDFLATEX           = NO
-
-# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. 
-# command to the generated LaTeX files. This will instruct LaTeX to keep 
-# running if errors occur, instead of asking the user for help. 
-# This option is also used when generating formulas in HTML.
-
-LATEX_BATCHMODE        = NO
-
-# If LATEX_HIDE_INDICES is set to YES then doxygen will not 
-# include the index chapters (such as File Index, Compound Index, etc.) 
-# in the output.
-
-LATEX_HIDE_INDICES     = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the RTF output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output 
-# The RTF output is optimised for Word 97 and may not look very pretty with 
-# other RTF readers or editors.
-
-GENERATE_RTF           = NO
-
-# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `rtf' will be used as the default path.
-
-RTF_OUTPUT             = rtf
-
-# If the COMPACT_RTF tag is set to YES Doxygen generates more compact 
-# RTF documents. This may be useful for small projects and may help to 
-# save some trees in general.
-
-COMPACT_RTF            = NO
-
-# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated 
-# will contain hyperlink fields. The RTF file will 
-# contain links (just like the HTML output) instead of page references. 
-# This makes the output suitable for online browsing using WORD or other 
-# programs which support those fields. 
-# Note: wordpad (write) and others do not support links.
-
-RTF_HYPERLINKS         = NO
-
-# Load stylesheet definitions from file. Syntax is similar to doxygen's 
-# config file, i.e. a series of assigments. You only have to provide 
-# replacements, missing definitions are set to their default value.
-
-RTF_STYLESHEET_FILE    = 
-
-# Set optional variables used in the generation of an rtf document. 
-# Syntax is similar to doxygen's config file.
-
-RTF_EXTENSIONS_FILE    = 
-
-#---------------------------------------------------------------------------
-# configuration options related to the man page output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_MAN tag is set to YES (the default) Doxygen will 
-# generate man pages
-
-GENERATE_MAN           = NO
-
-# The MAN_OUTPUT tag is used to specify where the man pages will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `man' will be used as the default path.
-
-MAN_OUTPUT             = man
-
-# The MAN_EXTENSION tag determines the extension that is added to 
-# the generated man pages (default is the subroutine's section .3)
-
-MAN_EXTENSION          = .3
-
-# If the MAN_LINKS tag is set to YES and Doxygen generates man output, 
-# then it will generate one additional man file for each entity 
-# documented in the real man page(s). These additional files 
-# only source the real man page, but without them the man command 
-# would be unable to find the correct page. The default is NO.
-
-MAN_LINKS              = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the XML output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_XML tag is set to YES Doxygen will 
-# generate an XML file that captures the structure of 
-# the code including all documentation. Note that this 
-# feature is still experimental and incomplete at the 
-# moment.
-
-GENERATE_XML           = NO
-
-# The XML_OUTPUT tag is used to specify where the XML pages will be put. 
-# If a relative path is entered the value of OUTPUT_DIRECTORY will be 
-# put in front of it. If left blank `xml' will be used as the default path.
-
-XML_OUTPUT             = xml
-
-# The XML_SCHEMA tag can be used to specify an XML schema, 
-# which can be used by a validating XML parser to check the 
-# syntax of the XML files.
-
-XML_SCHEMA             = 
-
-# The XML_DTD tag can be used to specify an XML DTD, 
-# which can be used by a validating XML parser to check the 
-# syntax of the XML files.
-
-XML_DTD                = 
-
-#---------------------------------------------------------------------------
-# configuration options for the AutoGen Definitions output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will 
-# generate an AutoGen Definitions (see autogen.sf.net) file 
-# that captures the structure of the code including all 
-# documentation. Note that this feature is still experimental 
-# and incomplete at the moment.
-
-GENERATE_AUTOGEN_DEF   = NO
-
-#---------------------------------------------------------------------------
-# configuration options related to the Perl module output
-#---------------------------------------------------------------------------
-
-# If the GENERATE_PERLMOD tag is set to YES Doxygen will 
-# generate a Perl module file that captures the structure of 
-# the code including all documentation. Note that this 
-# feature is still experimental and incomplete at the 
-# moment.
-
-GENERATE_PERLMOD       = NO
-
-# If the PERLMOD_LATEX tag is set to YES Doxygen will generate 
-# the necessary Makefile rules, Perl scripts and LaTeX code to be able 
-# to generate PDF and DVI output from the Perl module output.
-
-PERLMOD_LATEX          = NO
-
-# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be 
-# nicely formatted so it can be parsed by a human reader.  This is useful 
-# if you want to understand what is going on.  On the other hand, if this 
-# tag is set to NO the size of the Perl module output will be much smaller 
-# and Perl will parse it just the same.
-
-PERLMOD_PRETTY         = YES
-
-# The names of the make variables in the generated doxyrules.make file 
-# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
-# This is useful so different doxyrules.make files included by the same 
-# Makefile don't overwrite each other's variables.
-
-PERLMOD_MAKEVAR_PREFIX = 
-
-#---------------------------------------------------------------------------
-# Configuration options related to the preprocessor   
-#---------------------------------------------------------------------------
-
-# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will 
-# evaluate all C-preprocessor directives found in the sources and include 
-# files.
-
-ENABLE_PREPROCESSING   = NO
-
-# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro 
-# names in the source code. If set to NO (the default) only conditional 
-# compilation will be performed. Macro expansion can be done in a controlled 
-# way by setting EXPAND_ONLY_PREDEF to YES.
-
-MACRO_EXPANSION        = NO
-
-# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES 
-# then the macro expansion is limited to the macros specified with the 
-# PREDEFINED and EXPAND_AS_PREDEFINED tags.
-
-EXPAND_ONLY_PREDEF     = NO
-
-# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files 
-# in the INCLUDE_PATH (see below) will be search if a #include is found.
-
-SEARCH_INCLUDES        = NO
-
-# The INCLUDE_PATH tag can be used to specify one or more directories that 
-# contain include files that are not input files but should be processed by 
-# the preprocessor.
-
-INCLUDE_PATH           = 
-
-# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard 
-# patterns (like *.h and *.hpp) to filter out the header-files in the 
-# directories. If left blank, the patterns specified with FILE_PATTERNS will 
-# be used.
-
-INCLUDE_FILE_PATTERNS  = 
-
-# The PREDEFINED tag can be used to specify one or more macro names that 
-# are defined before the preprocessor is started (similar to the -D option of 
-# gcc). The argument of the tag is a list of macros of the form: name 
-# or name=definition (no spaces). If the definition and the = are 
-# omitted =1 is assumed.
-
-PREDEFINED             = 
-
-# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then 
-# this tag can be used to specify a list of macro names that should be expanded. 
-# The macro definition that is found in the sources will be used. 
-# Use the PREDEFINED tag if you want to use a different macro definition.
-
-EXPAND_AS_DEFINED      = 
-
-# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then 
-# doxygen's preprocessor will remove all function-like macros that are alone 
-# on a line, have an all uppercase name, and do not end with a semicolon. Such 
-# function macros are typically used for boiler-plate code, and will confuse the 
-# parser if not removed.
-
-SKIP_FUNCTION_MACROS   = YES
-
-#---------------------------------------------------------------------------
-# Configuration::addtions related to external references   
-#---------------------------------------------------------------------------
-
-# The TAGFILES option can be used to specify one or more tagfiles. 
-# Optionally an initial location of the external documentation 
-# can be added for each tagfile. The format of a tag file without 
-# this location is as follows: 
-#   TAGFILES = file1 file2 ... 
-# Adding location for the tag files is done as follows: 
-#   TAGFILES = file1=loc1 "file2 = loc2" ... 
-# where "loc1" and "loc2" can be relative or absolute paths or 
-# URLs. If a location is present for each tag, the installdox tool 
-# does not have to be run to correct the links.
-# Note that each tag file must have a unique name
-# (where the name does NOT include the path)
-# If a tag file is not located in the directory in which doxygen 
-# is run, you must also specify the path to the tagfile here.
-
-TAGFILES               = 
-
-# When a file name is specified after GENERATE_TAGFILE, doxygen will create 
-# a tag file that is based on the input files it reads.
-
-GENERATE_TAGFILE       = 
-
-# If the ALLEXTERNALS tag is set to YES all external classes will be listed 
-# in the class index. If set to NO only the inherited external classes 
-# will be listed.
-
-ALLEXTERNALS           = NO
-
-# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed 
-# in the modules index. If set to NO, only the current project's groups will 
-# be listed.
-
-EXTERNAL_GROUPS        = YES
-
-# The PERL_PATH should be the absolute path and name of the perl script 
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH              = /usr/bin/perl
-
-#---------------------------------------------------------------------------
-# Configuration options related to the dot tool   
-#---------------------------------------------------------------------------
-
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will 
-# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base or 
-# super classes. Setting the tag to NO turns the diagrams off. Note that this 
-# option is superceded by the HAVE_DOT option below. This is only a fallback. It is 
-# recommended to install and use dot, since it yields more powerful graphs.
-
-CLASS_DIAGRAMS         = YES
-
-# If set to YES, the inheritance and collaboration graphs will hide 
-# inheritance and usage relations if the target is undocumented 
-# or is not a class.
-
-HIDE_UNDOC_RELATIONS   = YES
-
-# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is 
-# available from the path. This tool is part of Graphviz, a graph visualization 
-# toolkit from AT&T and Lucent Bell Labs. The other options in this section 
-# have no effect if this option is set to NO (the default)
-
-HAVE_DOT               = NO
-
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect inheritance relations. Setting this tag to YES will force the 
-# the CLASS_DIAGRAMS tag to NO.
-
-CLASS_GRAPH            = YES
-
-# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen 
-# will generate a graph for each documented class showing the direct and 
-# indirect implementation dependencies (inheritance, containment, and 
-# class references variables) of the class with other documented classes.
-
-COLLABORATION_GRAPH    = YES
-
-# If the UML_LOOK tag is set to YES doxygen will generate inheritance and 
-# collaboration diagrams in a style similiar to the OMG's Unified Modeling 
-# Language.
-
-UML_LOOK               = NO
-
-# If set to YES, the inheritance and collaboration graphs will show the 
-# relations between templates and their instances.
-
-TEMPLATE_RELATIONS     = NO
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT 
-# tags are set to YES then doxygen will generate a graph for each documented 
-# file showing the direct and indirect include dependencies of the file with 
-# other documented files.
-
-INCLUDE_GRAPH          = YES
-
-# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and 
-# HAVE_DOT tags are set to YES then doxygen will generate a graph for each 
-# documented header file showing the documented files that directly or 
-# indirectly include this file.
-
-INCLUDED_BY_GRAPH      = YES
-
-# If the CALL_GRAPH and HAVE_DOT tags are set to YES then doxygen will 
-# generate a call dependency graph for every global function or class method. 
-# Note that enabling this option will significantly increase the time of a run. 
-# So in most cases it will be better to enable call graphs for selected 
-# functions only using the \callgraph command.
-
-CALL_GRAPH             = NO
-
-# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen 
-# will graphical hierarchy of all classes instead of a textual one.
-
-GRAPHICAL_HIERARCHY    = YES
-
-# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images 
-# generated by dot. Possible values are png, jpg, or gif
-# If left blank png will be used.
-
-DOT_IMAGE_FORMAT       = png
-
-# The tag DOT_PATH can be used to specify the path where the dot tool can be 
-# found. If left blank, it is assumed the dot tool can be found on the path.
-
-DOT_PATH               = 
-
-# The DOTFILE_DIRS tag can be used to specify one or more directories that 
-# contain dot files that are included in the documentation (see the 
-# \dotfile command).
-
-DOTFILE_DIRS           = 
-
-# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width 
-# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
-# this value, doxygen will try to truncate the graph, so that it fits within 
-# the specified constraint. Beware that most browsers cannot cope with very 
-# large images.
-
-MAX_DOT_GRAPH_WIDTH    = 1024
-
-# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height 
-# (in pixels) of the graphs generated by dot. If a graph becomes larger than 
-# this value, doxygen will try to truncate the graph, so that it fits within 
-# the specified constraint. Beware that most browsers cannot cope with very 
-# large images.
-
-MAX_DOT_GRAPH_HEIGHT   = 1024
-
-# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the 
-# graphs generated by dot. A depth value of 3 means that only nodes reachable 
-# from the root by following a path via at most 3 edges will be shown. Nodes that 
-# lay further from the root node will be omitted. Note that setting this option to 
-# 1 or 2 may greatly reduce the computation time needed for large code bases. Also 
-# note that a graph may be further truncated if the graph's image dimensions are 
-# not sufficient to fit the graph (see MAX_DOT_GRAPH_WIDTH and MAX_DOT_GRAPH_HEIGHT). 
-# If 0 is used for the depth value (the default), the graph is not depth-constrained.
-
-MAX_DOT_GRAPH_DEPTH    = 0
-
-# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will 
-# generate a legend page explaining the meaning of the various boxes and 
-# arrows in the dot generated graphs.
-
-GENERATE_LEGEND        = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will 
-# remove the intermediate dot files that are used to generate 
-# the various graphs.
-
-DOT_CLEANUP            = YES
-
-#---------------------------------------------------------------------------
-# Configuration::addtions related to the search engine   
-#---------------------------------------------------------------------------
-
-# The SEARCHENGINE tag specifies whether or not a search engine should be 
-# used. If set to NO the values of all tags below this one will be ignored.
-
-SEARCHENGINE           = NO
diff --git a/doc/thrust_logo.png b/doc/thrust_logo.png
deleted file mode 100644
index 123794b6a..000000000
Binary files a/doc/thrust_logo.png and /dev/null differ
diff --git a/doc/thrust_logo.svg b/doc/thrust_logo.svg
deleted file mode 100644
index 4fd82acaf..000000000
--- a/doc/thrust_logo.svg
+++ /dev/null
@@ -1,272 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!-- Created with Inkscape (http://www.inkscape.org/) -->
-<svg
-   xmlns:dc="http://purl.org/dc/elements/1.1/"
-   xmlns:cc="http://creativecommons.org/ns#"
-   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-   xmlns:svg="http://www.w3.org/2000/svg"
-   xmlns="http://www.w3.org/2000/svg"
-   xmlns:xlink="http://www.w3.org/1999/xlink"
-   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
-   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
-   width="1052.3622"
-   height="744.09448"
-   id="svg2"
-   sodipodi:version="0.32"
-   inkscape:version="0.46"
-   version="1.0"
-   sodipodi:docname="thrust_logo.svg"
-   inkscape:output_extension="org.inkscape.output.svg.inkscape"
-   inkscape:export-filename="/home/nathan/Desktop/Old/logos/thrust3svg.jpg.png"
-   inkscape:export-xdpi="90"
-   inkscape:export-ydpi="90">
-  <defs
-     id="defs4">
-    <linearGradient
-       id="linearGradient5922">
-      <stop
-         style="stop-color:#b3b3b3;stop-opacity:1;"
-         offset="0"
-         id="stop5924" />
-      <stop
-         style="stop-color:#b3b3b3;stop-opacity:0;"
-         offset="1"
-         id="stop5926" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5886">
-      <stop
-         id="stop5888"
-         offset="0"
-         style="stop-color:#666666;stop-opacity:1;" />
-      <stop
-         style="stop-color:#e3e3e3;stop-opacity:1;"
-         offset="0.47389936"
-         id="stop5890" />
-      <stop
-         id="stop5892"
-         offset="1"
-         style="stop-color:#666666;stop-opacity:1;" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5840">
-      <stop
-         id="stop5842"
-         offset="0"
-         style="stop-color:#1a1a1a;stop-opacity:1;" />
-      <stop
-         style="stop-color:#cbcbcb;stop-opacity:1;"
-         offset="0.42692322"
-         id="stop5844" />
-      <stop
-         id="stop5846"
-         offset="1"
-         style="stop-color:#252525;stop-opacity:1;" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5795">
-      <stop
-         style="stop-color:#666666;stop-opacity:1;"
-         offset="0"
-         id="stop5797" />
-      <stop
-         id="stop5805"
-         offset="0.36170211"
-         style="stop-color:#e3e3e3;stop-opacity:1;" />
-      <stop
-         style="stop-color:#666666;stop-opacity:1;"
-         offset="1"
-         id="stop5799" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5773">
-      <stop
-         style="stop-color:#3b3b3b;stop-opacity:1;"
-         offset="0"
-         id="stop5775" />
-      <stop
-         id="stop5781"
-         offset="0.4955157"
-         style="stop-color:#ececec;stop-opacity:0.49803922;" />
-      <stop
-         style="stop-color:#000000;stop-opacity:0;"
-         offset="1"
-         id="stop5777" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient5743">
-      <stop
-         style="stop-color:#626161;stop-opacity:1;"
-         offset="0"
-         id="stop5745" />
-      <stop
-         id="stop5753"
-         offset="0.44680852"
-         style="stop-color:#161882;stop-opacity:0.49803922;" />
-      <stop
-         style="stop-color:#00bb00;stop-opacity:0;"
-         offset="1"
-         id="stop5747" />
-    </linearGradient>
-    <linearGradient
-       id="linearGradient3213">
-      <stop
-         style="stop-color:#000000;stop-opacity:1;"
-         offset="0"
-         id="stop3215" />
-      <stop
-         style="stop-color:#a7a7a7;stop-opacity:0;"
-         offset="1"
-         id="stop3217" />
-    </linearGradient>
-    <inkscape:perspective
-       sodipodi:type="inkscape:persp3d"
-       inkscape:vp_x="0 : 526.18109 : 1"
-       inkscape:vp_y="0 : 1000 : 0"
-       inkscape:vp_z="744.09448 : 526.18109 : 1"
-       inkscape:persp3d-origin="372.04724 : 350.78739 : 1"
-       id="perspective10" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5795"
-       id="linearGradient5810"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="1120.5692"
-       y2="201.83484" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5795"
-       id="linearGradient5824"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1227.724,586.99847)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="663.33466"
-       y2="-144.52788" />
-    <linearGradient
-       inkscape:collect="always"
-       xlink:href="#linearGradient5840"
-       id="linearGradient5838"
-       gradientUnits="userSpaceOnUse"
-       gradientTransform="matrix(1.0247944,0,0,0.7176622,-1246.1936,214.03097)"
-       x1="771.13623"
-       y1="-287.25806"
-       x2="1137.2974"
-       y2="174.0116" />
-  </defs>
-  <sodipodi:namedview
-     id="base"
-     pagecolor="#ffffff"
-     bordercolor="#666666"
-     borderopacity="1.0"
-     gridtolerance="10000"
-     guidetolerance="10"
-     objecttolerance="10"
-     inkscape:pageopacity="0.0"
-     inkscape:pageshadow="2"
-     inkscape:zoom="1"
-     inkscape:cx="513.86573"
-     inkscape:cy="372.04724"
-     inkscape:document-units="px"
-     inkscape:current-layer="layer1"
-     showgrid="false"
-     inkscape:window-width="1920"
-     inkscape:window-height="1125"
-     inkscape:window-x="0"
-     inkscape:window-y="25" />
-  <metadata
-     id="metadata7">
-    <rdf:RDF>
-      <cc:Work
-         rdf:about="">
-        <dc:format>image/svg+xml</dc:format>
-        <dc:type
-           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
-      </cc:Work>
-    </rdf:RDF>
-  </metadata>
-  <g
-     inkscape:label="Layer 1"
-     inkscape:groupmode="layer"
-     id="layer1">
-    <g
-       id="g3189"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999">
-      <path
-         d="M 256.90625,209.34375 C 245.27561,209.38319 234.38709,213.94209 226.03125,221.0625 C 216.48171,229.20011 209.59283,242.94767 214.65625,256.65625 L 288.125,455.5625 C 291.48237,464.65215 295.87551,473.99003 303.21875,481.625 C 310.56199,489.25997 321.45303,494.71875 334.15625,494.71875 L 805.34375,494.71875 C 817.97624,494.71876 828.98878,489.54948 836.625,481.90625 C 844.26122,474.26302 848.88495,464.56763 851.65625,454.6875 L 889.5,319.75 C 893.24724,306.39046 886.23452,293.51892 877,286.21875 C 867.76548,278.91858 856.12028,274.84557 844.4375,273.5625 L 261.9375,209.59375 C 260.25138,209.40857 258.56777,209.33812 256.90625,209.34375 z"
-         inkscape:href="#rect2474"
-         id="path3265"
-         style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1"
-         xlink:href="#rect2474"
-         inkscape:original="M 258.6875 221.03125 C 239.30554 218.90262 217.29031 236.04476 223.4375 252.6875 L 296.90625 451.59375 C 303.05344 468.2365 312.62987 483.21875 332.15625 483.21875 L 803.34375 483.21875 C 822.87016 483.21876 833.82448 468.59699 838.59375 451.59375 L 876.4375 316.65625 C 881.20677 299.65302 860.56946 287.12863 841.1875 285 L 258.6875 221.03125 z "
-         inkscape:radius="11.495221"
-         sodipodi:type="inkscape:offset" />
-      <path
-         sodipodi:nodetypes="czzzzzzzz"
-         id="rect2474"
-         d="M 841.1984,285.00037 L 258.69824,221.02711 C 239.31628,218.89848 217.30488,236.03474 223.45207,252.67748 L 296.91964,451.58125 C 303.06684,468.22399 312.63943,483.23161 332.16581,483.23161 L 803.35147,483.23161 C 822.87785,483.23161 833.82838,468.58449 838.59765,451.58125 L 876.44458,316.65074 C 881.21385,299.6475 860.58036,287.129 841.1984,285.00037 z"
-         style="fill:#66b366;fill-opacity:1;stroke:#000000;stroke-width:2.10967277999999991;stroke-linejoin:round;stroke-miterlimit:4;stroke-opacity:1" />
-    </g>
-    <g
-       id="g3251"
-       transform="matrix(0.913744,0,0,0.3451662,176.2736,220.85042)"
-       style="opacity:1"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999">
-      <g
-         id="g3253"
-         transform="matrix(2.0484578,-1.263301,0.1197948,2.5356515,-182.46458,-362.9203)">
-        <path
-           sodipodi:type="inkscape:offset"
-           inkscape:radius="5.4485359"
-           inkscape:original="M 291.6875 279 C 206.19469 277.76693 90.813927 330.28055 44.5625 378.59375 C 119.00866 442.66663 390.60576 547.17687 393.5 375.5625 C 394.67595 305.83429 350.18258 279.84368 291.6875 279 z "
-           xlink:href="#path3255"
-           style="fill:#666666;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           id="path3263"
-           inkscape:href="#path3255"
-           d="M 291.78125,273.5625 C 247.88427,272.92937 197.14434,285.95647 151.3125,305.1875 C 105.48066,324.41853 64.633863,349.73338 40.625,374.8125 C 39.587603,375.89202 39.04008,377.35083 39.111013,378.84633 C 39.181946,380.34183 39.865085,381.74226 41,382.71875 C 79.595929,415.93675 166.14169,457.95278 244.96875,470.84375 C 284.38228,477.28923 321.94436,476.49105 350.625,462.34375 C 379.30564,448.19645 398.18956,420.0057 398.9375,375.65625 C 399.5452,339.62233 388.08647,313.71403 368.46875,297.28125 C 348.85103,280.84847 321.81559,273.99569 291.78125,273.5625 z" />
-        <path
-           style="fill:#ffee00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 393.50906,375.56396 C 396.40371,203.9253 122.46857,297.21173 44.57143,378.58133 C 119.01759,442.65421 390.61482,547.17833 393.50906,375.56396 z"
-           id="path3255"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#ffb500;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 385.4286,375.1448 C 388.01423,252.50309 143.32293,319.15945 73.741661,377.30082 C 140.24036,423.0831 382.84333,497.76917 385.4286,375.1448 z"
-           id="path3257"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#ff6c00;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 382.08135,375.00509 C 383.96651,268.69569 205.56124,326.47536 154.8293,376.87398 C 203.31374,416.55939 380.19638,481.29945 382.08135,375.00509 z"
-           id="path3259"
-           sodipodi:nodetypes="ccz" />
-        <path
-           style="fill:#e42800;fill-opacity:1;fill-rule:evenodd;stroke:none;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
-           d="M 378.29864,374.84209 C 379.58638,287.58705 257.71919,335.01058 223.06461,376.37601 C 256.18393,408.9484 377.01103,462.08477 378.29864,374.84209 z"
-           id="path3261"
-           sodipodi:nodetypes="ccz" />
-      </g>
-    </g>
-    <text
-       xml:space="preserve"
-       style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;opacity:1;fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1.99999785;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic"
-       x="352.8208"
-       y="466.72366"
-       id="text3247"
-       transform="matrix(1.0688669,0,-0.2132749,0.9355701,0,0)"
-       inkscape:export-filename="/home/nathan/NV/thrust/doc/thrust_logo.png"
-       inkscape:export-xdpi="47.029999"
-       inkscape:export-ydpi="47.029999"><tspan
-         sodipodi:role="line"
-         id="tspan3249"
-         x="352.8208"
-         y="466.72366"
-         style="font-size:178.33847046px;font-style:italic;font-variant:normal;font-weight:bold;font-stretch:normal;fill:#ffffff;stroke:#000000;stroke-width:1.99999785;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1;font-family:UnDotum;-inkscape-font-specification:UnDotum Bold Italic">Thrust</tspan></text>
-  </g>
-</svg>
diff --git a/docs/doxybook/config.json b/docs/doxybook/config.json
new file mode 100644
index 000000000..56b7a238b
--- /dev/null
+++ b/docs/doxybook/config.json
@@ -0,0 +1,49 @@
+{
+  "baseUrl": "{{ site.baseurl }}/api/",
+  "copyImages": true,
+  "fileExt": "md",
+  "filesFilter": [],
+  "folderClassesName": "classes",
+  "folderExamplesName": "examples",
+  "folderFilesName": "files",
+  "folderGroupsName": "groups",
+  "folderNamespacesName": "namespaces",
+  "folderRelatedPagesName": "pages",
+  "imagesFolder": "images",
+  "indexClassesName": "index_classes",
+  "indexClassesTitle": "Classes",
+  "indexExamplesName": "index_examples",
+  "indexExamplesTitle": "Examples",
+  "indexFilesName": "index_files",
+  "indexFilesTitle": "Files",
+  "indexGroupsName": "index_groups",
+  "indexGroupsTitle": "Groups",
+  "indexInFolders": false,
+  "indexNamespacesName": "index_namespaces",
+  "indexNamespacesTitle": "namespaces",
+  "indexRelatedPagesName": "index_pages",
+  "indexRelatedPagesTitle": "pages",
+  "linkLowercase": true,
+  "linkAndInlineCodeAsHTML": true,
+  "linkSuffix": ".html",
+  "mainPageInRoot": false,
+  "mainPageName": "indexpage",
+  "sort": false,
+  "templateIndexClasses": "index_classes",
+  "templateIndexExamples": "index_examples",
+  "templateIndexFiles": "index_files",
+  "templateIndexGroups": "index_groups",
+  "templateIndexNamespaces": "index_namespaces",
+  "templateIndexRelatedPages": "index_pages",
+  "templateKindClass": "kind_class",
+  "templateKindDir": "kind_file",
+  "templateKindExample": "kind_page",
+  "templateKindFile": "kind_file",
+  "templateKindGroup": "kind_nonclass",
+  "templateKindInterface": "kind_class",
+  "templateKindNamespace": "kind_nonclass",
+  "templateKindPage": "kind_page",
+  "templateKindStruct": "kind_class",
+  "templateKindUnion": "kind_class",
+  "useFolders": true
+}
diff --git a/docs/doxybook/templates/class_members.tmpl b/docs/doxybook/templates/class_members.tmpl
new file mode 100644
index 000000000..cb5f65f38
--- /dev/null
+++ b/docs/doxybook/templates/class_members.tmpl
@@ -0,0 +1,210 @@
+{%- if exists("publicClasses") or exists("publicTypes") or exists("publicAttributes") or exists("publicFunctions") or exists("friends") -%}
+  {%- set has_public_members = true -%}
+{%- endif -%}
+{%- if exists("protectedClasses") or exists("protectedTypes") or exists("protectedAttributes") or exists("protectedFunctions") -%}
+  {%- set has_protected_members = true -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}
+  {%- for base in baseClasses -%}
+    {%- if existsIn(base, "publicClasses") or existsIn(base, "publicTypes") or existsIn(base, "publicAttributes") or existsIn(base, "publicFunctions") or existsIn(base, "friends") -%}
+      {%- set has_public_members = true -%}
+    {%- endif -%}
+    {%- if existsIn(base, "protectedClasses") or existsIn(base, "protectedTypes") or existsIn(base, "protectedAttributes") or existsIn(base, "protectedFunctions") -%}
+      {%- set has_protected_members = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+<code class="doxybook">
+{%- if exists("includes") -%}
+  <span>#include {{includes}}</span>{{ noop() -}}
+  <br>
+{%- endif -%}
+{%- include "synopsis_template_parameters.tmpl" -%}
+<span>{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} {</span>
+{%- set synopsis_indent_width = 2 -%}
+{%- set names_qualified = false -%}
+{%- if default(has_public_members, false) -%}
+  <span>public:</span>{{- noop() -}}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicTypes") -%}
+    {%- for child in base.publicTypes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_type.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicClasses") -%}
+    {%- for child in base.publicClasses -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_class.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("friends") -%}
+  {%- for child in friends -%}
+    {%- if child.type == "class" or child.type == "struct" -%}
+      {%- include "synopsis_friend_class.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "friends") -%}
+    {%- for child in base.friends -%}
+      {%- if child.type == "class" or child.type == "struct" -%}
+        {%- set synopsis_is_inherited = true -%}
+        {%- include "synopsis_friend_class.tmpl" -%}
+        {%- set synopsis_is_inherited = false -%}
+        {%- set synopsis_needs_leading_line_break = true -%}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicAttributes") -%}
+    {%- for child in base.publicAttributes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_variable.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- include "synopsis_function.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "publicFunctions") -%}
+    {%- for child in base.publicFunctions -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("friends") -%}
+  {%- for child in friends -%}
+    {%- if child.type != "class" and child.type != "struct" -%}
+      {%- include "synopsis_friend_function.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "friends") -%}
+    {%- for child in base.friends -%}
+      {%- if child.type != "class" and child.type != "struct" -%}
+        {%- set synopsis_is_inherited = true -%}
+        {%- include "synopsis_friend_function.tmpl" -%}
+        {%- set synopsis_is_inherited = false -%}
+        {%- set synopsis_needs_leading_line_break = true -%}
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if default(has_public_members, false) -%}
+  {%- if default(has_protected_members, false) -%}
+    <br>
+  {%- endif -%}
+{%- endif -%}
+{#- Reset leading line breaks for protected members -#}{{ noop() -}}
+{%- set synopsis_needs_leading_line_break = false -%}
+{%- if default(has_protected_members, false) -%}
+  <span>protected:</span>{{- noop() -}}
+{%- endif -%}
+{%- if exists("protectedTypes") -%}
+  {%- for child in protectedTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedTypes") -%}
+    {%- for child in base.protectedTypes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_type.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedClasses") -%}
+  {%- for child in protectedClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedClasses") -%}
+    {%- for child in base.protectedClasses -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_class.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedAttributes") -%}
+  {%- for child in protectedAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedAttributes") -%}
+    {%- for child in base.protectedAttributes -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_variable.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- if exists("protectedFunctions") -%}
+  {%- for child in protectedFunctions -%}
+    {%- include "synopsis_function.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("baseClasses") -%}{%- for base in baseClasses -%}
+  {%- if existsIn(base, "protectedFunctions") -%}
+    {%- for child in base.protectedFunctions -%}
+      {%- set synopsis_is_inherited = true -%}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_is_inherited = false -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endfor -%}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
+{%- set synopsis_indent_width = 0 -%}
+<span>};</span>
+</code>
+
diff --git a/docs/doxybook/templates/class_members_details.tmpl b/docs/doxybook/templates/class_members_details.tmpl
new file mode 100644
index 000000000..a77eec5ef
--- /dev/null
+++ b/docs/doxybook/templates/class_members_details.tmpl
@@ -0,0 +1,49 @@
+{%- if exists("publicClasses") -%}## Member Classes
+
+  {%- for child in publicClasses -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicTypes") -%}## Member Types
+
+  {%- for child in publicTypes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicAttributes") %}## Member Variables
+
+  {%- for child in publicAttributes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicFunctions") %}## Member Functions
+
+  {%- for child in publicFunctions -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("protectedTypes") -%}## Protected Member Types
+  {%- for child in publicTypes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("protectedAttributes") -%}## Protected Member Variables
+
+  {%- for child in protectedAttributes -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("protectedFunctions") -%}## Protected Member Functions
+
+  {%- for child in protectedFunctions -%}
+    {% include "title_member.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+
diff --git a/docs/doxybook/templates/details.tmpl b/docs/doxybook/templates/details.tmpl
new file mode 100644
index 000000000..d72119abf
--- /dev/null
+++ b/docs/doxybook/templates/details.tmpl
@@ -0,0 +1,206 @@
+{%- if exists("brief") -%}{{brief}}
+
+{% endif -%}
+{%- if exists("details") -%}{{details}}
+
+{% endif -%}
+{%- if exists("inbody") -%}{{inbody}}
+
+{% endif -%}
+{%- if exists("tests") -%}**Test**:
+  {%- if length(tests) == 1 -%}{{first(tests)}}
+  {%- else -%}
+    {%- for item in tests -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("note") -%}**Note**:
+  {%- if length(note) == 1 -%}{{first(note)}}
+  {%- else -%}
+    {%- for item in note -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("remark") -%}**Remark**:
+  {%- if length(remark) == 1 -%}{{first(remark)}}
+  {%- else -%}
+    {%- for item in remark -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("attention") -%}**Attention**:
+  {%- if length(attention) == 1 -%}{{first(attention)}}
+  {%- else -%}
+    {%- for item in attention -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("bugs") -%}**Bug**:
+  {%- if length(bugs) == 1 -%}{{first(bugs)}}
+  {%- else -%}
+    {%- for item in bugs -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("warning") -%}**Warning**:
+  {%- if length(warning) == 1 -%}{{first(warning)}}
+  {%- else -%}
+    {%- for item in warning -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("todos") -%}**TODO**:
+  {%- if length(todos) == 1 -%}{{first(todos)}}
+  {%- else -%}
+    {%- for item in todos -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("templateParamsList") -%}**Template Parameters**:
+  {%- if length(templateParamsList) == 1 -%}**`{{get(first(templateParamsList), "name")}}`**: {{get(first(templateParamsList), "text")}}
+  {%- else -%}
+    {%- for param in templateParamsList -%}* **`{{param.name}}`** {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("paramList") -%}**Function Parameters**:
+  {%- if length(paramList) == 1 -%}**`{{get(first(paramList), "name")}}`**: {{get(first(paramList), "text")}}
+  {%- else -%}
+    {%- for param in paramList -%}* **`{{param.name}}`** {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("pre") -%}**Preconditions**:
+  {%- if length(pre) == 1 -%}{{first(pre)}}
+  {%- else -%}
+    {%- for item in pre -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("post") -%}**Postconditions**:
+  {%- if length(post) == 1 -%}{{first(post)}}
+  {%- else -%}
+    {%- for item in post -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("invariant") -%}**Invariant**:
+  {%- if length(invariant) == 1 -%}{{first(invariant)}}
+  {%- else -%}
+    {%- for item in invariant -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("returns") or exists("returnsList") -%}**Returns**:
+  {%- if exists("returns") and exists("returnsList") -%}
+    {%- for item in returns -%}* {{item}}
+    {%- endfor -%}
+    {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}}
+    {%- endfor -%}
+  {%- else if exists("returns") -%}
+    {%- if length(returns) == 1 -%}{{first(returns)}}
+    {%- else -%} 
+      {%- for item in returns -%}* {{item}}
+      {%- endfor -%}
+    {%- endif -%}
+  {%- else if exists("returnsList") -%}
+    {%- if length(returnsList) == 1 -%}**`{{get(first(returnsList), "name")}}`** {{get(first(returnsList), "text")}}
+    {%- else -%} 
+      {%- for item in returnsList -%}* **`{{item.name}}`**: {{item.text}}
+      {%- endfor -%}
+    {%- endif -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("exceptionsList") -%}**Exceptions**:
+  {%- if length(exceptionsList) == 1 -%}**`{{get(first(exceptionsList), "name")}}`**: {{get(first(exceptionsList), "text")}}
+  {%- else -%}
+    {%- for param in exceptionsList -%}* **`{{param.name}}`**: {{param.text}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("deprecated") -%}**Deprecated**: {{deprecated}}
+
+{% endif -%}
+{%- if exists("authors") -%}**Author**:
+  {%- if length(authors) == 1 -%}{{first(authors)}}
+  {%- else -%}
+    {%- for item in authors -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("copyright") -%}**Copyright**:
+  {%- if length(copyright) == 1 -%}{{first(copyright)}}
+  {%- else -%}
+    {%- for item in copyright -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("version") -%}**Version**:
+  {%- if length(version) == 1 -%}{{first(version)}}
+  {%- else -%}
+    {%- for item in version -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("since") -%}**Since**:
+  {%- if length(since) == 1 -%}{{first(since)}}
+  {%- else -%}
+    {%- for item in since -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("date") -%}**Date**:
+  {%- if length(date) == 1 -%}{{first(date)}}
+  {%- else -%}
+    {%- for item in date -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("baseClasses") -%}**Inherits From**:
+  {%- if length(baseClasses) == 1 -%}
+    {%- if existsIn(first(baseClasses), "url") -%}[`{{get(first(baseClasses), "name")}}`]({{get(first(baseClasses), "url")}})
+    {%- else -%}`{{get(first(baseClasses), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for base in baseClasses -%}
+      {%- if existsIn(baseClasses, "url") -%}* [`{{base.name}}`]({{base.url}})
+      {%- else -%}* `{{base.name}}`
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("derivedClasses") -%}**Inherited By**:
+  {%- if length(derivedClasses) == 1 -%}
+    {%- if existsIn(first(derivedClasses), "url") -%}[`{{get(first(derivedClasses), "name")}}`]({{get(first(derivedClasses), "url")}})
+    {%- else -%}`{{get(first(derivedClasses), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for derived in derivedClasses -%}
+      {%- if existsIn(first(derivedClasses), "url") -%}* [`{{derived.name}}`]({{derived.url}})
+      {%- else -%}* `{{derived.name}}`{%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("reimplements") -%}**Implements**: [`{{reimplements.name}}`]({{reimplements.url}})
+
+{% endif -%}
+{%- if exists("reimplementedBy") -%}**Implemented By**:
+  {%- if length(reimplementedBy) == 1 -%}
+    {%- if existsIn(first(reimplementedBy), "url") -%}[`{{get(first(reimplementedBy), "name")}}`]({{get(first(reimplementedBy), "url")}})
+    {%- else -%}`{{get(first(reimplementedBy), "name")}}`
+    {%- endif -%}
+  {%- else -%}
+    {%- for impl in reimplementedBy -%}
+      {%- if existsIn(first(reimplementedBy), "url") -%}* [`{{impl.name}}`]({{impl.url}})
+      {%- else -%}* `{{impl.name}}`
+      {%- endif -%}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
+{%- if exists("see") -%}**See**:
+  {%- if length(see) == 1 -%}{{first(see)}}
+  {%- else -%}
+    {%- for item in see -%}* {{item}}
+    {%- endfor -%}
+  {%- endif %}
+{% endif -%}
diff --git a/docs/doxybook/templates/frontmatter.tmpl b/docs/doxybook/templates/frontmatter.tmpl
new file mode 100644
index 000000000..d3b1e5b4f
--- /dev/null
+++ b/docs/doxybook/templates/frontmatter.tmpl
@@ -0,0 +1,43 @@
+---
+{%- if exists("title") -%}
+  title: {{title}}
+{%- else if exists("name") -%}
+  title: {{name}}
+{%- endif -%}
+{%- if exists("summary") -%}
+  summary: {{summary}}
+{%- endif -%}
+{%- if exists("moduleBreadcrumbs") -%}
+  {%- if length(moduleBreadcrumbs) > 0 -%}
+    parent: {{ get(last(moduleBreadcrumbs), "title") }}
+  {%- endif -%}
+  {%- if length(moduleBreadcrumbs) > 1 -%}
+    grand_parent: {{ get(index(moduleBreadcrumbs, -2), "title") }}
+  {%- else if length(moduleBreadcrumbs == 1) and exists("kind") and kind == "group" -%}
+    grand_parent: API
+  {%- endif -%}
+{%- else if exists("kind") and kind == "group" -%}
+  parent: API
+{%- endif -%}
+{%- if exists("kind") and kind == "group" -%}
+  nav_exclude: false
+{%- else -%}
+  nav_exclude: true
+{%- endif -%}
+has_children: true
+has_toc: false
+---
+
+{%- if exists("title") -%}
+  {%- if exists("kind") and kind in ["class", "struct", "namespace"] -%}
+    # {{title(kind)}} `{{title}}`
+  {%- else -%}
+    # {{title}}
+  {%- endif -%}
+{%- else if exists("name") -%}
+  {%- if exists("kind") and kind != "page" -%}
+    # {{name}} {{title(kind)}} Reference
+  {%- else -%}
+    # {{name}}
+  {%- endif -%}
+{%- endif %}
diff --git a/docs/doxybook/templates/index.tmpl b/docs/doxybook/templates/index.tmpl
new file mode 100644
index 000000000..e28f37729
--- /dev/null
+++ b/docs/doxybook/templates/index.tmpl
@@ -0,0 +1,14 @@
+{%- if exists("children") -%}{%- for child in children -%}
+  {%- for i in range(default(index_depth, 0)) -%}
+    {{- noop() }}  {{ noop() -}}
+  {%- endfor -%}
+  * {{ noop() -}}
+  <b><a href="{{ child.url }}">{{ render("name_qualified.tmpl", child) }}</a></b>{{ noop() -}}
+  {%- if existsIn(child, "brief") -%}
+    {{- noop() }} <br> {{ child.brief -}}
+  {%- endif %}
+  {%- if existsIn(child, "children") -%}
+    {%- set child.index_depth = default(index_depth, 0) + 1 -%}
+    {{- render("index.tmpl", child) -}}
+  {%- endif -%}
+{%- endfor -%}{%- endif -%}
diff --git a/docs/doxybook/templates/index_classes.tmpl b/docs/doxybook/templates/index_classes.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_classes.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_examples.tmpl b/docs/doxybook/templates/index_examples.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_examples.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_files.tmpl b/docs/doxybook/templates/index_files.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_files.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_groups.tmpl b/docs/doxybook/templates/index_groups.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_groups.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_namespaces.tmpl b/docs/doxybook/templates/index_namespaces.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_namespaces.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/index_pages.tmpl b/docs/doxybook/templates/index_pages.tmpl
new file mode 100644
index 000000000..1ccdf71e9
--- /dev/null
+++ b/docs/doxybook/templates/index_pages.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{% include "index.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_class.tmpl b/docs/doxybook/templates/kind_class.tmpl
new file mode 100644
index 000000000..e5650b69b
--- /dev/null
+++ b/docs/doxybook/templates/kind_class.tmpl
@@ -0,0 +1,4 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%}
+{% include "class_members.tmpl" -%}
+{% include "class_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_example.tmpl b/docs/doxybook/templates/kind_example.tmpl
new file mode 100644
index 000000000..48501318b
--- /dev/null
+++ b/docs/doxybook/templates/kind_example.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{%- if exists("details") -%}{{details}}{%- endif -%}
diff --git a/docs/doxybook/templates/kind_file.tmpl b/docs/doxybook/templates/kind_file.tmpl
new file mode 100644
index 000000000..c883442f1
--- /dev/null
+++ b/docs/doxybook/templates/kind_file.tmpl
@@ -0,0 +1,10 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails -%}{% include "details.tmpl" -%}{%- endif -%}
+{% include "nonclass_members_details.tmpl" -%}
+{% include "nonclass_members.tmpl" -%}
+{%- if exists("programlisting") -%}
+
+```cpp
+{{programlisting}}
+```
+{%- endif -%}
diff --git a/docs/doxybook/templates/kind_group.tmpl b/docs/doxybook/templates/kind_group.tmpl
new file mode 100644
index 000000000..1ff7342a4
--- /dev/null
+++ b/docs/doxybook/templates/kind_group.tmpl
@@ -0,0 +1,4 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%}
+{% include "nonclass_members.tmpl" -%}
+{% include "nonclass_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_nonclass.tmpl b/docs/doxybook/templates/kind_nonclass.tmpl
new file mode 100644
index 000000000..299208c41
--- /dev/null
+++ b/docs/doxybook/templates/kind_nonclass.tmpl
@@ -0,0 +1,8 @@
+{% include "frontmatter.tmpl" -%}
+{%- if hasDetails %}{% include "details.tmpl" -%}{%- endif -%}
+{% if kind == "namespace" -%}
+  {%- include "namespace_members.tmpl" -%}
+{%- else -%}
+  {%- include "nonclass_members.tmpl" -%}
+{%- endif -%}
+{% include "nonclass_members_details.tmpl" -%}
diff --git a/docs/doxybook/templates/kind_page.tmpl b/docs/doxybook/templates/kind_page.tmpl
new file mode 100644
index 000000000..48501318b
--- /dev/null
+++ b/docs/doxybook/templates/kind_page.tmpl
@@ -0,0 +1,2 @@
+{% include "frontmatter.tmpl" -%}
+{%- if exists("details") -%}{{details}}{%- endif -%}
diff --git a/docs/doxybook/templates/member_details.tmpl b/docs/doxybook/templates/member_details.tmpl
new file mode 100644
index 000000000..14b34dcfc
--- /dev/null
+++ b/docs/doxybook/templates/member_details.tmpl
@@ -0,0 +1,39 @@
+{%- if exists("type") and type in ["class", "struct"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_class.tmpl" -%}
+  </code>
+{%- else if kind == "enum" -%}
+  {%- include "table_header_enum.tmpl" -%}
+  {%- for enumerator in enumvalues -%}{{- render("table_row_enum.tmpl", enumerator) -}}
+  {%- endfor %}
+{%- else if kind in ["typedef", "using"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  <span>{%- include "synopsis_kind.tmpl" -%}<b>{{name}}</b>{%- include "synopsis_initializer.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind in ["variable", "property"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  <span>{%- include "synopsis_type_and_leading_specifiers.tmpl" -%}<b>{{name}}</b>{%- include "synopsis_initializer.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind in ["function", "slot", "signal", "event"] -%}
+  <code class="doxybook">
+  {%- include "synopsis_template_parameters.tmpl" -%}
+  {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%}
+  <span><b>{{name}}</b>({%- include "synopsis_function_parameters.tmpl" -%}){%- include "synopsis_function_trailing_specifiers.tmpl" -%};</span>{{- noop() -}}
+  </code>
+{%- else if kind == "friend" -%}
+  {%- if type != "class" and type != "struct" -%}
+    <code class="doxybook">
+    {% include "synopsis_template_parameters.tmpl" -%}
+    {% include "synopsis_function_type_and_leading_specifiers.tmpl" -%}
+    <span><b>{{name}}</b>({%- include "synopsis_function_parameters.tmpl" -%}){%- include "synopsis_function_trailing_specifiers.tmpl" -%};</span>{{- noop() -}}
+    </code>
+  {%- endif -%}
+{%- else if kind == "define" -%}
+  {#- We have no way to get the parameters to function-like     -#}{{ noop() -}}
+  {#- macros, and the macro definitions in `initializer` fields -#}{{ noop() -}}
+  {#- don't have line breaks. So we can't render a useful       -#}{{ noop() -}}
+  {#- synopsis.                                                 -#}{{ noop() -}}
+{% endif -%}
+{% include "details.tmpl" -%}
diff --git a/docs/doxybook/templates/name.tmpl b/docs/doxybook/templates/name.tmpl
new file mode 100644
index 000000000..09f15420e
--- /dev/null
+++ b/docs/doxybook/templates/name.tmpl
@@ -0,0 +1,5 @@
+{%- if default(names_qualified, true) -%}
+  {{- render("name_qualified.tmpl", child) -}}
+{%- else -%}
+  {{- render("name_unqualified.tmpl", child) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/name_qualified.tmpl b/docs/doxybook/templates/name_qualified.tmpl
new file mode 100644
index 000000000..da088dd34
--- /dev/null
+++ b/docs/doxybook/templates/name_qualified.tmpl
@@ -0,0 +1,7 @@
+{%- if exists("qualifiedname") -%}
+  {{- escape(qualifiedname) -}}
+{%- else if exists("name") -%}
+  {{- escape(name) -}}
+{%- else -%}
+  {{- escape(title) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/name_unqualified.tmpl b/docs/doxybook/templates/name_unqualified.tmpl
new file mode 100644
index 000000000..2a0d73725
--- /dev/null
+++ b/docs/doxybook/templates/name_unqualified.tmpl
@@ -0,0 +1,5 @@
+{%- if exists("name") -%}
+  {{- escape(stripNamespace(name)) -}}
+{%- else -%}
+  {{- escape(stripNamespace(title)) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/namespace_members.tmpl b/docs/doxybook/templates/namespace_members.tmpl
new file mode 100644
index 000000000..8bb4bdffc
--- /dev/null
+++ b/docs/doxybook/templates/namespace_members.tmpl
@@ -0,0 +1,43 @@
+<code class="doxybook">
+{%- if exists("includes") -%}
+  <span>#include {{includes}}</span>{{ noop() -}}
+  <br>
+{%- endif -%}
+<span>{%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} {</span>
+{%- set synopsis_needs_leading_line_break = true -%}
+{%- set names_qualified = false -%}
+{%- if exists("namespaces") -%}
+  {%- for child in namespaces -%}
+    {%- include "synopsis_namespace_abbreviated.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- if existsIn(child, "type") -%}
+      {#- If the child doesn't have a type, it's probably a    -#}{{- noop() -}}
+      {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}}
+      {#- due to a bug whose nature is beyond me.              -#}{{- noop() -}}
+      {%- include "synopsis_function.tmpl" -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+<span>} {{ noop() -}}
+  /* {%- include "synopsis_kind.tmpl" -%}{% include "name_qualified.tmpl" %} */{{ noop() -}}
+</span>
+</code>
+
diff --git a/docs/doxybook/templates/nonclass_members.tmpl b/docs/doxybook/templates/nonclass_members.tmpl
new file mode 100644
index 000000000..af3d39c17
--- /dev/null
+++ b/docs/doxybook/templates/nonclass_members.tmpl
@@ -0,0 +1,60 @@
+{%- if exists("groups") %}## Groups
+
+  {%- for child in sort(groups) -%}* **[{{ child.title }}]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("dirs") %}## Directories
+
+  {%- for child in dirs -%}* **[`{{ child.name }}`]({{ child.url }})**{% if existsIn(child, "brief") %}: {{ child.brief }}{% endif %}
+  {%- endfor %}
+{% endif -%}
+{%- if exists("files") %}## Files
+
+  {%- include "table_header_brief.tmpl" -%}
+  {%- for child in files -%}{{- render("table_row_brief.tmpl", child) -}}
+  {%- endfor %}
+{% endif -%}
+<code class="doxybook">
+{%- if exists("namespaces") -%}
+  {%- for child in namespaces -%}
+    {%- include "synopsis_namespace_abbreviated.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicClasses") -%}
+  {%- for child in publicClasses -%}
+    {%- include "synopsis_class.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicTypes") -%}
+  {%- for child in publicTypes -%}
+    {%- include "synopsis_type.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicAttributes") -%}
+  {%- for child in publicAttributes -%}
+    {%- include "synopsis_variable.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("publicFunctions") -%}
+  {%- for child in publicFunctions -%}
+    {%- if existsIn(child, "type") -%}
+      {#- If the child doesn't have a type, it's probably a    -#}{{- noop() -}}
+      {#- constructor that Doxygen put into a non-class entity -#}{{- noop() -}}
+      {#- due to a bug whose nature is beyond me.              -#}{{- noop() -}}
+      {%- include "synopsis_function.tmpl" -%}
+      {%- set synopsis_needs_leading_line_break = true -%}
+    {%- endif -%}
+  {%- endfor -%}
+{%- endif -%}
+{%- if exists("defines") -%}
+  {%- for child in defines -%}
+    {%- include "synopsis_macro.tmpl" -%}
+    {%- set synopsis_needs_leading_line_break = true -%}
+  {%- endfor -%}
+{%- endif -%}
+</code>
+
diff --git a/docs/doxybook/templates/nonclass_members_details.tmpl b/docs/doxybook/templates/nonclass_members_details.tmpl
new file mode 100644
index 000000000..c941f22f7
--- /dev/null
+++ b/docs/doxybook/templates/nonclass_members_details.tmpl
@@ -0,0 +1,35 @@
+{%- if exists("publicClasses") -%}## Member Classes
+
+  {%- for child in publicClasses -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicTypes") -%}## Types
+
+  {%- for child in publicTypes -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicAttributes") %}## Variables
+
+  {%- for child in publicAttributes -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("publicFunctions") %}## Functions
+
+  {%- for child in publicFunctions -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
+{%- if exists("defines") %}## Macros
+
+  {%- for child in defines -%}
+    {% include "title_nonmember.tmpl" %}
+    {{- render("member_details.tmpl", child) -}}
+  {%- endfor %}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_brief.tmpl b/docs/doxybook/templates/synopsis_brief.tmpl
new file mode 100644
index 000000000..2f48cec1d
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_brief.tmpl
@@ -0,0 +1,8 @@
+{%- if exists("brief") -%}
+  <span class="doxybook-comment">{{ noop() -}}
+    {%- if default(synopsis_indent_width, 0) != 0 -%}
+      <code>{%- include "synopsis_indent.tmpl" -%}</code>
+    {%- endif -%}
+    /* {{ brief }} */{{ noop() -}}
+  </span>{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_class.tmpl b/docs/doxybook/templates/synopsis_class.tmpl
new file mode 100644
index 000000000..a5492997c
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_class.tmpl
@@ -0,0 +1,16 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{#- The Doxygen metadata that a parent has on its nested   -#}{{ noop() -}}
+{#- classes doesn't include their template parameters.     -#}{{ noop() -}}
+{#- Fortunately, we have the refid of the nested class, so -#}{{ noop() -}}
+{#- so we can just load the data from their page.          -#}{{ noop() -}}
+{%- set child_class = load(child.refid)) -%}
+{%- set child_class.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_template_parameters.tmpl", child_class) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>;{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_friend_class.tmpl b/docs/doxybook/templates/synopsis_friend_class.tmpl
new file mode 100644
index 000000000..39f23bb09
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_friend_class.tmpl
@@ -0,0 +1,14 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{#- Unfortunately, the refid and URL for a friend class  -#}{{ noop() -}}
+{#- incorrectly refers to a definition on the local      -#}{{ noop() -}}
+{#- page, instead of the friend class's own page.        -#}{{ noop() -}}
+{#- So we don't link to friend classes.                  -#}{{ noop() -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b>{{- render("name_qualified.tmpl", child) -}}</b>;{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_friend_function.tmpl b/docs/doxybook/templates/synopsis_friend_function.tmpl
new file mode 100644
index 000000000..440989c23
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_friend_function.tmpl
@@ -0,0 +1,19 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{#- Unfortunately, the refid and URL for a friend class  -#}{{ noop() -}}
+{#- incorrectly refers to a definition on the local      -#}{{ noop() -}}
+{#- page, instead of the friend class's own page.        -#}{{ noop() -}}
+{#- So we don't link to friend classes.                  -#}{{ noop() -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  friend {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}}
+</span>
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  <b>{{- render("name_qualified.tmpl", child) -}}</b>{{ noop() -}}
+  ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}}
+  {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_function.tmpl b/docs/doxybook/templates/synopsis_function.tmpl
new file mode 100644
index 000000000..93a3e822e
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function.tmpl
@@ -0,0 +1,12 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+{{- render("synopsis_function_type_and_leading_specifiers.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  ({{- render("synopsis_function_parameters.tmpl", child) -}}){{ noop() -}}
+  {{- render("synopsis_function_trailing_specifiers.tmpl", child) -}};{{- noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_function_parameters.tmpl b/docs/doxybook/templates/synopsis_function_parameters.tmpl
new file mode 100644
index 000000000..204a52c50
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_parameters.tmpl
@@ -0,0 +1,11 @@
+{%- for param in params -%}
+  {%- if not loop.is_first -%}&nbsp;&nbsp;{%- endif -%}
+  {{- param.type -}}
+  {%- if not isEmpty(param.name) %} {% endif -%}
+  {{- param.name -}}
+  {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%}
+  {%- if not loop.is_last -%}
+    ,</span>
+    {{- noop() }}<span>{% include "synopsis_indent.tmpl" -%}
+  {%- endif -%}
+{%- endfor -%}
diff --git a/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl
new file mode 100644
index 000000000..bbde0f1dd
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_trailing_specifiers.tmpl
@@ -0,0 +1,5 @@
+{%- if const %} const{% endif -%}
+{%- if override %} override{% endif -%}
+{%- if default %} = default{% endif -%}
+{%- if deleted %} = deleted{% endif -%}
+{%- if pureVirtual %} = 0{% endif -%}
diff --git a/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl
new file mode 100644
index 000000000..5cde64d28
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_function_type_and_leading_specifiers.tmpl
@@ -0,0 +1,6 @@
+{%- if default(virtual, false) or default(static, false) or default(explicit, false) or default(type, false) -%}
+  <span>{{ noop() -}}
+    {%- include "synopsis_indent.tmpl" -%}
+    {%- include "synopsis_type_and_leading_specifiers.tmpl" -%}
+  </span>{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_indent.tmpl b/docs/doxybook/templates/synopsis_indent.tmpl
new file mode 100644
index 000000000..a2d7193a6
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_indent.tmpl
@@ -0,0 +1,5 @@
+{%- if default(synopsis_indent_width, false) -%}
+  {%- for i in range(synopsis_indent_width) -%}
+    &nbsp;{{ noop() -}}
+  {%- endfor -%}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_inherited_from.tmpl b/docs/doxybook/templates/synopsis_inherited_from.tmpl
new file mode 100644
index 000000000..fd88b649c
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_inherited_from.tmpl
@@ -0,0 +1,4 @@
+{%- if default(synopsis_is_inherited, false) != false -%}
+  {%- set base.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+  {{- render("synopsis_inherited_from_comment.tmpl", base) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl
new file mode 100644
index 000000000..4afda1250
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_inherited_from_comment.tmpl
@@ -0,0 +1,8 @@
+<span class="doxybook-comment">{{ noop() -}}
+  {%- if default(synopsis_indent_width, 0) != 0 -%}
+    <code>{%- include "synopsis_indent.tmpl" -%}</code>
+  {%- endif -%}
+  /* Inherited from <code>{{ noop() -}}
+    <b><a href="{{ url }}">{%- include "name_qualified.tmpl" -%}</a></b>{{ noop() -}}
+  </code> */{{ noop() -}}
+</span>{{ noop() -}}
diff --git a/docs/doxybook/templates/synopsis_initializer.tmpl b/docs/doxybook/templates/synopsis_initializer.tmpl
new file mode 100644
index 000000000..dd159979d
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_initializer.tmpl
@@ -0,0 +1,3 @@
+{%- if kind == "using" %} = {{ escape(type) -}}
+{%- else if exists("initializer") %} {{ escape(initializer) -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl
new file mode 100644
index 000000000..2bc4d4856
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_initializer_abbreviated.tmpl
@@ -0,0 +1 @@
+{% if kind == "using" or exists("initializer") %} = <i>see below</i>{% endif -%}
diff --git a/docs/doxybook/templates/synopsis_kind.tmpl b/docs/doxybook/templates/synopsis_kind.tmpl
new file mode 100644
index 000000000..34cd602a9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_kind.tmpl
@@ -0,0 +1,9 @@
+{%- if kind == "interface" %}class {{ noop() -}}
+{%- else if kind == "namespace" %}namespace {{ noop() -}}
+{%- else if kind == "typedef" %}typedef {{ type -}}
+{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%} {{ noop() -}}
+{%- else if kind == "friend" %}friend {{ noop() -}}
+  {%- if type == "class" or type == "struct" %}{{ type }} {% endif -%}
+{%- else if kind == "define" %}#define {{ noop() -}}
+{%- else %}{{ kind }} {{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl
new file mode 100644
index 000000000..881582773
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_kind_abbreviated.tmpl
@@ -0,0 +1,9 @@
+{%- if kind == "interface" %}class {{ noop() -}}
+{%- else if kind == "namespace" %}namespace {{ noop() -}}
+{%- else if kind == "typedef" %}typedef <i>see below</i> {{ noop() -}}
+{%- else if kind == "enum" %}enum {% if strong %}class {% endif -%}
+{%- else if kind == "friend" %}friend {{ noop() -}}
+  {%- if type == "class" or type == "struct" %}{{type}} {% endif -%}
+{%- else if kind == "define" %}#define {{ noop() -}}
+{%- else %}{{ kind }} {{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_leading_line_break.tmpl b/docs/doxybook/templates/synopsis_leading_line_break.tmpl
new file mode 100644
index 000000000..13a1574e3
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_leading_line_break.tmpl
@@ -0,0 +1,3 @@
+{%- if default(synopsis_needs_leading_line_break, false) -%}
+  <br>
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_macro.tmpl b/docs/doxybook/templates/synopsis_macro.tmpl
new file mode 100644
index 000000000..612773439
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_macro.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl
new file mode 100644
index 000000000..682f615c9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_member_namespace_abbreviated.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b> { <i>…</i> }{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl
new file mode 100644
index 000000000..682f615c9
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_namespace_abbreviated.tmpl
@@ -0,0 +1,7 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+<span>{{ noop() -}}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{{- render("name_qualified.tmpl", child) -}}</a></b> { <i>…</i> }{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_template_parameters.tmpl b/docs/doxybook/templates/synopsis_template_parameters.tmpl
new file mode 100644
index 000000000..4391c3d99
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_template_parameters.tmpl
@@ -0,0 +1,14 @@
+{%- if exists("templateParams") -%}
+  <span>{% include "synopsis_indent.tmpl" -%}template &lt;{{ noop() -}}
+  {%- for param in templateParams -%}
+    {%- if not loop.is_first %}{% include "synopsis_indent.tmpl" -%}&nbsp;&nbsp;{% endif -%}
+    {{- param.type -}}
+    {%- if not isEmpty(param.name) %} {% endif -%}
+    {{- param.name -}}
+    {%- if existsIn(param, "defvalPlain") %} = {{ escape(param.defvalPlain) }}{% endif -%}
+    {%- if not loop.is_last -%}
+      ,</span>
+      {{- noop() }}<span>{% include "synopsis_indent.tmpl" -%}
+    {%- endif -%}
+  {%- endfor -%}&gt;</span>
+{%- endif -%}
diff --git a/docs/doxybook/templates/synopsis_type.tmpl b/docs/doxybook/templates/synopsis_type.tmpl
new file mode 100644
index 000000000..586555f08
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_type.tmpl
@@ -0,0 +1,11 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_kind_abbreviated.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl
new file mode 100644
index 000000000..12136020f
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_type_and_leading_specifiers.tmpl
@@ -0,0 +1,4 @@
+{%- if default(virtual, false) %}virtual {% endif -%}
+{%- if default(static, false) %}static {% endif -%}
+{%- if default(explicit, false) %}explicit {% endif -%}
+{%- if exists("type") %}{{ type }} {% endif -%}
diff --git a/docs/doxybook/templates/synopsis_variable.tmpl b/docs/doxybook/templates/synopsis_variable.tmpl
new file mode 100644
index 000000000..52c48da50
--- /dev/null
+++ b/docs/doxybook/templates/synopsis_variable.tmpl
@@ -0,0 +1,11 @@
+{%- include "synopsis_leading_line_break.tmpl" -%}
+{%- include "synopsis_inherited_from.tmpl" -%}
+{%- set child.synopsis_indent_width = default(synopsis_indent_width, 0) -%}
+{{- render("synopsis_brief.tmpl", child) -}}
+{{- render("synopsis_template_parameters.tmpl", child) -}}
+<span>{{ noop() -}}
+  {%- include "synopsis_indent.tmpl" -%}
+  {{- render("synopsis_type_and_leading_specifiers.tmpl", child) -}}
+  <b><a href="{{ child.url }}">{% include "name.tmpl" %}</a></b>{{ noop() -}}
+  {{- render("synopsis_initializer_abbreviated.tmpl", child) -}};{{ noop() -}}
+</span>
diff --git a/docs/doxybook/templates/table_header_brief.tmpl b/docs/doxybook/templates/table_header_brief.tmpl
new file mode 100644
index 000000000..ed13f970f
--- /dev/null
+++ b/docs/doxybook/templates/table_header_brief.tmpl
@@ -0,0 +1,2 @@
+| Name | Description |
+|------|-------------|
diff --git a/docs/doxybook/templates/table_header_enum.tmpl b/docs/doxybook/templates/table_header_enum.tmpl
new file mode 100644
index 000000000..cdf95bc6f
--- /dev/null
+++ b/docs/doxybook/templates/table_header_enum.tmpl
@@ -0,0 +1,2 @@
+| Enumerator | Value | Description |
+|------------|-------|-------------|
diff --git a/docs/doxybook/templates/table_row_brief.tmpl b/docs/doxybook/templates/table_row_brief.tmpl
new file mode 100644
index 000000000..1d599755f
--- /dev/null
+++ b/docs/doxybook/templates/table_row_brief.tmpl
@@ -0,0 +1 @@
+| **[`{{name}}`]({{url}})** | {% if exists("brief") %}{{brief}}{% endif %} |
diff --git a/docs/doxybook/templates/table_row_enum.tmpl b/docs/doxybook/templates/table_row_enum.tmpl
new file mode 100644
index 000000000..77c205be3
--- /dev/null
+++ b/docs/doxybook/templates/table_row_enum.tmpl
@@ -0,0 +1 @@
+| `{{ name }}` | {% if exists("initializer") -%}`{{ escape(replace(initializer, "= ", "")) }}`{%- endif %} | {% if exists("brief") -%}{{ brief }}{%- endif %} |
diff --git a/docs/doxybook/templates/title_kind.tmpl b/docs/doxybook/templates/title_kind.tmpl
new file mode 100644
index 000000000..100db2e84
--- /dev/null
+++ b/docs/doxybook/templates/title_kind.tmpl
@@ -0,0 +1,4 @@
+{%- if child.kind == "using" %}Type Alias{{ noop() -}}
+{%- else -%}{{ title(child.kind) -}}
+{%- endif -%}
+{%- if child.kind == "enum" and child.strong %} Class{%- endif -%}
diff --git a/docs/doxybook/templates/title_leading.tmpl b/docs/doxybook/templates/title_leading.tmpl
new file mode 100644
index 000000000..54eb7e967
--- /dev/null
+++ b/docs/doxybook/templates/title_leading.tmpl
@@ -0,0 +1,4 @@
+<h3 id="{{ child.kind }}-{{ safeAnchorId(child.name) }}">
+{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%}
+  <a href="{{ child.url }}">{{ noop() -}}
+{%- endif -%}
diff --git a/docs/doxybook/templates/title_member.tmpl b/docs/doxybook/templates/title_member.tmpl
new file mode 100644
index 000000000..50e70f378
--- /dev/null
+++ b/docs/doxybook/templates/title_member.tmpl
@@ -0,0 +1,4 @@
+{%- include "title_leading.tmpl" -%}
+  {%- include "title_kind.tmpl" -%}
+  {{- noop() }} <code>{% include "name_qualified.tmpl" %}::{{ render("name_unqualified.tmpl", child) }}</code>
+{%- include "title_trailing.tmpl" -%}
diff --git a/docs/doxybook/templates/title_nonmember.tmpl b/docs/doxybook/templates/title_nonmember.tmpl
new file mode 100644
index 000000000..4ea9797fd
--- /dev/null
+++ b/docs/doxybook/templates/title_nonmember.tmpl
@@ -0,0 +1,5 @@
+{%- include "title_leading.tmpl" -%}
+  {%- include "title_kind.tmpl" -%}
+  {{- noop() }} <code>{{render("name_qualified.tmpl", child)}}</code>
+{%- include "title_trailing.tmpl" -%}
+
diff --git a/docs/doxybook/templates/title_trailing.tmpl b/docs/doxybook/templates/title_trailing.tmpl
new file mode 100644
index 000000000..fcc4f24e6
--- /dev/null
+++ b/docs/doxybook/templates/title_trailing.tmpl
@@ -0,0 +1,4 @@
+{%- if existsIn(child, "kind") and child.kind in ["class", "struct"] -%}
+  </a>
+{%- endif -%}
+</h3>
diff --git a/docs/doxygen/config.dox b/docs/doxygen/config.dox
new file mode 100644
index 000000000..7e06e3545
--- /dev/null
+++ b/docs/doxygen/config.dox
@@ -0,0 +1,2611 @@
+# Doxyfile 1.9.3
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a double hash (##) is considered a comment and is placed in
+# front of the TAG it is preceding.
+#
+# All text after a single hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (\" \").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
+# The default value is: UTF-8.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by
+# double-quotes, unless you are using Doxywizard) that should identify the
+# project for which the documentation is generated. This name is used in the
+# title of most generated pages and in a few other places.
+# The default value is: My Project.
+
+PROJECT_NAME           = Thrust
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number. This
+# could be handy for archiving the generated documentation or if some version
+# control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer a
+# quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or an icon that is included
+# in the documentation. The maximum height of the logo should not exceed 55
+# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy
+# the logo to the output directory.
+
+PROJECT_LOGO           =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path
+# into which the generated documentation will be written. If a relative path is
+# entered, it will be relative to the location where doxygen was started. If
+# left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub-
+# directories (in 2 levels) under the output directory of each output format and
+# will distribute the generated files over these directories. Enabling this
+# option can be useful when feeding doxygen a huge amount of source files, where
+# putting all generated files in the same directory would otherwise causes
+# performance problems for the file system.
+# The default value is: NO.
+
+CREATE_SUBDIRS         = NO
+
+# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII
+# characters to appear in the names of generated files. If set to NO, non-ASCII
+# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode
+# U+3044.
+# The default value is: NO.
+
+ALLOW_UNICODE_NAMES    = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese,
+# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States),
+# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian,
+# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages),
+# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian,
+# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian,
+# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish,
+# Ukrainian and Vietnamese.
+# The default value is: English.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
+# descriptions after the members that are listed in the file and class
+# documentation (similar to Javadoc). Set to NO to disable this.
+# The default value is: YES.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief
+# description of a member or function before the detailed description
+#
+# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+# The default value is: YES.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator that is
+# used to form the text in various listings. Each string in this list, if found
+# as the leading text of the brief description, will be stripped from the text
+# and the result, after processing the whole list, is used as the annotated
+# text. Otherwise, the brief description is used as-is. If left blank, the
+# following values are used ($name is automatically replaced with the name of
+# the entity):The $name class, The $name widget, The $name file, is, provides,
+# specifies, contains, represents, a, an and the.
+
+ABBREVIATE_BRIEF       = "The $name class" \
+                         "The $name widget" \
+                         "The $name file" \
+                         is \
+                         provides \
+                         specifies \
+                         contains \
+                         represents \
+                         a \
+                         an \
+                         the
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# doxygen will generate a detailed section even if there is only a brief
+# description.
+# The default value is: NO.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+# The default value is: NO.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path
+# before files name in the file list and in the header files. If set to NO the
+# shortest path that makes the file name unique will be used
+# The default value is: YES.
+
+FULL_PATH_NAMES        = YES
+
+# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path.
+# Stripping is only done if one of the specified strings matches the left-hand
+# part of the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the path to
+# strip.
+#
+# Note that you can specify absolute paths here, but also relative paths, which
+# will be relative from the directory where doxygen is started.
+# This tag requires that the tag FULL_PATH_NAMES is set to YES.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the
+# path mentioned in the documentation of a class, which tells the reader which
+# header file to include in order to use a class. If left blank only the name of
+# the header file containing the class definition is used. Otherwise one should
+# specify the list of include paths that are normally passed to the compiler
+# using the -I flag.
+
+STRIP_FROM_INC_PATH    = .
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but
+# less readable) file names. This can be useful is your file systems doesn't
+# support long names like on DOS, Mac, or CD-ROM.
+# The default value is: NO.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the
+# first line (until the first dot) of a Javadoc-style comment as the brief
+# description. If set to NO, the Javadoc-style will behave just like regular Qt-
+# style comments (thus requiring an explicit @brief command for a brief
+# description.)
+# The default value is: NO.
+
+JAVADOC_AUTOBRIEF      = YES
+
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
+# line (until the first dot) of a Qt-style comment as the brief description. If
+# set to NO, the Qt-style will behave just like regular Qt-style comments (thus
+# requiring an explicit \brief command for a brief description.)
+# The default value is: NO.
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a
+# multi-line C++ special comment block (i.e. a block of //! or /// comments) as
+# a brief description. This used to be the default behavior. The new default is
+# to treat a multi-line C++ comment block as a detailed description. Set this
+# tag to YES if you prefer the old behavior instead.
+#
+# Note that setting this tag to YES also means that rational rose comments are
+# not recognized any more.
+# The default value is: NO.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
+# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
+# documentation from any documented member that it re-implements.
+# The default value is: YES.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new
+# page for each member. If set to NO, the documentation of a member will be part
+# of the file/class/namespace that contains it.
+# The default value is: NO.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen
+# uses this value to replace tabs by spaces in code fragments.
+# Minimum value: 1, maximum value: 16, default value: 4.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act as commands in
+# the documentation. An alias has the form:
+# name=value
+# For example adding
+# "sideeffect=@par Side Effects:^^"
+# will allow you to put the command \sideeffect (or @sideeffect) in the
+# documentation, which will result in a user-defined paragraph with heading
+# "Side Effects:". Note that you cannot put \n's in the value part of an alias
+# to insert newlines (in the resulting output). You can put ^^ in the value part
+# of an alias to insert a newline as if a physical newline was in the original
+# file. When you need a literal { or } or , in the value part of an alias you
+# have to escape them by means of a backslash (\), this can lead to conflicts
+# with the commands \{ and \} for these it is advised to use the version @{ and
+# @} or use a double escape (\\{ and \\})
+
+ALIASES                =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C. For
+# instance, some of the names that are used will be different. The list of all
+# members will be omitted, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or
+# Python sources only. Doxygen will then generate output that is more tailored
+# for that language. For instance, namespaces will be presented as packages,
+# qualified scopes will look different, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources. Doxygen will then generate output that is tailored for Fortran.
+# The default value is: NO.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for VHDL.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension, and
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice,
+# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the later case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
+#
+# Note: For files without extension you can use no_extension as a placeholder.
+#
+# Note that for custom extensions you also need to set FILE_PATTERNS otherwise
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
+
+EXTENSION_MAPPING      =
+
+# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
+# according to the Markdown format, which allows for more readable
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you can
+# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
+# case of backward compatibilities issues.
+# The default value is: YES.
+
+MARKDOWN_SUPPORT       = YES
+
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 0
+
+# When enabled doxygen tries to link words that correspond to documented
+# classes, or namespaces to their corresponding documentation. Such a link can
+# be prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+# The default value is: YES.
+
+AUTOLINK_SUPPORT       = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should set this
+# tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string);
+# versus func(std::string) {}). This also make the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+# The default value is: NO.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+# The default value is: NO.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
+# will parse them like normal C++ but will assume all classes use public instead
+# of private inheritance when no explicit protection keyword is present.
+# The default value is: NO.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate
+# getter and setter methods for a property. Setting this option to YES will make
+# doxygen to replace the get and set methods by a property in the documentation.
+# This will only work if the methods are indeed getting or setting a simple
+# type. If this is not the case, or you want to show the methods anyway, you
+# should set this option to NO.
+# The default value is: YES.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+# The default value is: NO.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# If one adds a struct or class to a group and this option is enabled, then also
+# any nested class or struct is added to the same group. By default this option
+# is disabled and one has to add nested compounds explicitly via \ingroup.
+# The default value is: NO.
+
+GROUP_NESTED_COMPOUNDS = NO
+
+# Set the SUBGROUPING tag to YES to allow class member groups of the same type
+# (for instance a group of public functions) to be put as a subgroup of that
+# type (e.g. under the Public Functions section). Set it to NO to prevent
+# subgrouping. Alternatively, this can be done per class using the
+# \nosubgrouping command.
+# The default value is: YES.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions
+# are shown inside the group in which they are included (e.g. using \ingroup)
+# instead of on a separate page (for HTML and Man pages) or section (for LaTeX
+# and RTF).
+#
+# Note that this feature does not work in combination with
+# SEPARATE_MEMBER_PAGES.
+# The default value is: NO.
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions
+# with only public data fields or simple typedef fields will be shown inline in
+# the documentation of the scope in which they are defined (i.e. file,
+# namespace, or group documentation), provided this scope is documented. If set
+# to NO, structs, classes, and unions are shown on a separate page (for HTML and
+# Man pages) or section (for LaTeX and RTF).
+# The default value is: NO.
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or
+# enum is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically be
+# useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+# The default value is: NO.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This
+# cache is used to resolve symbols given their name and scope. Since this can be
+# an expensive process and often the same symbol appears multiple times in the
+# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small
+# doxygen will become slower. If the cache is too large, memory is wasted. The
+# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range
+# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536
+# symbols. At the end of a run doxygen will report the cache usage and suggest
+# the optimal cache size from a speed point of view.
+# Minimum value: 0, maximum value: 9, default value: 0.
+
+LOOKUP_CACHE_SIZE      = 0
+
+# The NUM_PROC_THREADS specifies the number threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will based this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in
+# documentation are documented, even if no documentation was available. Private
+# class members and static file members will be hidden unless the
+# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES.
+# Note: This will also disable the warnings about undocumented members that are
+# normally produced when WARNINGS is set to YES.
+# The default value is: NO.
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will
+# be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
+# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
+# scope will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PACKAGE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be
+# included in the documentation.
+# The default value is: NO.
+
+EXTRACT_STATIC         = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined
+# locally in source files will be included in the documentation. If set to NO,
+# only classes defined in header files are included. Does not have any effect
+# for Java sources.
+# The default value is: YES.
+
+EXTRACT_LOCAL_CLASSES  = YES
+
+# This flag is only useful for Objective-C code. If set to YES, local methods,
+# which are defined in the implementation section but not in the interface are
+# included in the documentation. If set to NO, only methods in the interface are
+# included.
+# The default value is: NO.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base name of
+# the file that contains the anonymous namespace. By default anonymous namespace
+# are hidden.
+# The default value is: NO.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
+# undocumented members inside documented classes or files. If set to NO these
+# members will be included in the various overviews, but no documentation
+# section is generated. This option has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy. If set
+# to NO, these classes will be included in the various overviews. This option
+# has no effect if EXTRACT_ALL is enabled.
+# The default value is: NO.
+
+HIDE_UNDOC_CLASSES     = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
+# The default value is: NO.
+
+HIDE_FRIEND_COMPOUNDS  = YES
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any
+# documentation blocks found inside the body of a function. If set to NO, these
+# blocks will be appended to the function's detailed documentation block.
+# The default value is: NO.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation that is typed after a
+# \internal command is included. If the tag is set to NO then the documentation
+# will be excluded. Set it to YES to include the internal documentation.
+# The default value is: NO.
+
+INTERNAL_DOCS          = NO
+
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
+# The default value is: system dependent.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with
+# their full class and namespace scopes in the documentation. If set to YES, the
+# scope will be hidden.
+# The default value is: NO.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will
+# append additional text to a page's title, such as Class Reference. If set to
+# YES the compound reference will be hidden.
+# The default value is: NO.
+
+HIDE_COMPOUND_REFERENCE= NO
+
+# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class
+# will show which file needs to be included to use the class.
+# The default value is: YES.
+
+SHOW_HEADERFILE        = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of
+# the files that are included by a file in the documentation of that file.
+# The default value is: YES.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each
+# grouped member an include statement to the documentation, telling the reader
+# which file to include in order to use the member.
+# The default value is: NO.
+
+SHOW_GROUPED_MEMB_INC  = NO
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include
+# files with double quotes in the documentation rather than with sharp brackets.
+# The default value is: NO.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the
+# documentation for inline members.
+# The default value is: YES.
+
+INLINE_INFO            = NO
+
+# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the
+# (detailed) documentation of file and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order.
+# The default value is: YES.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief
+# descriptions of file, namespace and class members alphabetically by member
+# name. If set to NO, the members will appear in declaration order. Note that
+# this will also influence the order of the classes in the class list.
+# The default value is: NO.
+
+SORT_BRIEF_DOCS        = NO
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the
+# (brief and detailed) documentation of class members so that constructors and
+# destructors are listed first. If set to NO the constructors will appear in the
+# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS.
+# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief
+# member documentation.
+# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting
+# detailed member documentation.
+# The default value is: NO.
+
+SORT_MEMBERS_CTORS_1ST = YES
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy
+# of group names into alphabetical order. If set to NO the group names will
+# appear in their defined order.
+# The default value is: NO.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by
+# fully-qualified names, including namespaces. If set to NO, the class list will
+# be sorted only by class name, not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the alphabetical
+# list.
+# The default value is: NO.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper
+# type resolution of all parameters of a function it will reject a match between
+# the prototype and the implementation of a member function even if there is
+# only one candidate or it is obvious which candidate to choose by doing a
+# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still
+# accept a match between prototype and implementation in such cases.
+# The default value is: NO.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo
+# list. This list is created by putting \todo commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test
+# list. This list is created by putting \test commands in the documentation.
+# The default value is: YES.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug
+# list. This list is created by putting \bug commands in the documentation.
+# The default value is: YES.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO)
+# the deprecated list. This list is created by putting \deprecated commands in
+# the documentation.
+# The default value is: YES.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional documentation
+# sections, marked by \if <section_label> ... \endif and \cond <section_label>
+# ... \endcond blocks.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the
+# initial value of a variable or macro / define can have for it to appear in the
+# documentation. If the initializer consists of more lines than specified here
+# it will be hidden. Use a value of 0 to hide initializers completely. The
+# appearance of the value of individual variables and macros / defines can be
+# controlled using \showinitializer or \hideinitializer command in the
+# documentation regardless of this setting.
+# Minimum value: 0, maximum value: 10000, default value: 30.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at
+# the bottom of the documentation of classes and structs. If set to YES, the
+# list will mention the files that were used to generate the documentation.
+# The default value is: YES.
+
+SHOW_USED_FILES        = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This
+# will remove the Files entry from the Quick Index and from the Folder Tree View
+# (if specified).
+# The default value is: YES.
+
+SHOW_FILES             = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces
+# page. This will remove the Namespaces entry from the Quick Index and from the
+# Folder Tree View (if specified).
+# The default value is: YES.
+
+SHOW_NAMESPACES        = NO
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command command input-file, where command is the value of the
+# FILE_VERSION_FILTER tag, and input-file is the name of an input file provided
+# by doxygen. Whatever the program writes to standard output is used as the file
+# version. For an example see the documentation.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option. You can
+# optionally specify a file name after the option, if omitted DoxygenLayout.xml
+# will be used as the name of the layout file. See also section "Changing the
+# layout of pages" for information.
+#
+# Note that if you run doxygen from a directory containing a file called
+# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
+# tag is left empty.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files containing
+# the reference definitions. This must be a list of .bib files. The .bib
+# extension is automatically appended if omitted. This requires the bibtex tool
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
+# For LaTeX the style of the bibliography can be controlled using
+# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
+# search path. See also \cite for info how to create references.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# Configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated to
+# standard output by doxygen. If QUIET is set to YES this implies that the
+# messages are off.
+# The default value is: NO.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
+# this implies that the warnings are on.
+#
+# Tip: Turn warnings on while writing the documentation.
+# The default value is: YES.
+
+WARNINGS               = YES
+
+# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate
+# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag
+# will automatically be disabled.
+# The default value is: YES.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as documenting some parameters in
+# a documented function twice, or documenting parameters that don't exist or
+# using markup commands wrongly.
+# The default value is: YES.
+
+WARN_IF_DOC_ERROR      = YES
+
+# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete
+# function parameter documentation. If set to NO, doxygen will accept that some
+# parameters have no documentation without warning.
+# The default value is: YES.
+
+WARN_IF_INCOMPLETE_DOC = YES
+
+# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
+# are documented, but have no documentation for their parameters or return
+# value. If set to NO, doxygen will only warn about wrong parameter
+# documentation, but not about the absence of documentation. If EXTRACT_ALL is
+# set to YES then this flag will automatically be disabled. See also
+# WARN_IF_INCOMPLETE_DOC
+# The default value is: NO.
+
+WARN_NO_PARAMDOC       = NO
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that doxygen
+# can produce. The string should contain the $file, $line, and $text tags, which
+# will be replaced by the file and line number from which the warning originated
+# and the warning text. Optionally the format may contain $version, which will
+# be replaced by the version of the file (if it could be obtained via
+# FILE_VERSION_FILTER)
+# The default value is: $file:$line: $text.
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning and error
+# messages should be written. If left blank the output is written to standard
+# error (stderr). In case the file specified cannot be opened for writing the
+# warning and error messages are written to standard error. When as file - is
+# specified the warning and error messages are written to standard output
+# (stdout).
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag is used to specify the files and/or directories that contain
+# documented source files. You may enter file names like myfile.cpp or
+# directories like /usr/src/myproject. Separate the files or directories with
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
+# Note: If this tag is empty the current directory is searched.
+
+INPUT                  = thrust
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
+# libiconv (or the iconv built into libc) for the transcoding. See the libiconv
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
+# The default value is: UTF-8.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and
+# *.h) to filter out the source-files in the directories.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# read by doxygen.
+#
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
+# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
+# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
+# *.hh, *.hxx, *.hpp, *.h++, *.l, *.cs, *.d, *.php, *.php4, *.php5, *.phtml,
+# *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C
+# comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd,
+# *.vhdl, *.ucf, *.qsf and *.ice.
+
+FILE_PATTERNS          =
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories should
+# be searched for input files as well.
+# The default value is: NO.
+
+RECURSIVE              = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+#
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+# The default value is: NO.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       = *detail*
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# ANamespace::AClass, ANamespace::*Test
+#
+# Note that the wildcards are matched against the file with absolute path, so to
+# exclude all test directories use the pattern */test/*
+
+EXCLUDE_SYMBOLS        =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or directories
+# that contain example code fragments that are included (see the \include
+# command).
+
+EXAMPLE_PATH           = examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and
+# *.h) to filter out the source-files in the directories. If left blank all
+# files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude commands
+# irrespective of the value of the RECURSIVE tag.
+# The default value is: NO.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or directories
+# that contain images that are to be included in the documentation (see the
+# \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command:
+#
+# <filter> <input-file>
+#
+# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the
+# name of an input file. Doxygen will then use the output that the filter
+# program writes to standard output. If FILTER_PATTERNS is specified, this tag
+# will be ignored.
+#
+# Note that the filter must not add or remove lines; it is applied before the
+# code is scanned, but not when the output code is generated. If lines are added
+# or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis. Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match. The filters are a list of the form: pattern=filter
+# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
+# filters are used. If the FILTER_PATTERNS tag is empty or if none of the
+# patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will also be used to filter the input files that are used for
+# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES).
+# The default value is: NO.
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and
+# it is also possible to disable source filtering for a specific pattern using
+# *.ext= (so without naming a filter).
+# This tag requires that the tag FILTER_SOURCE_FILES is set to YES.
+
+FILTER_SOURCE_PATTERNS =
+
+# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that
+# is part of the input, its contents will be placed on the main page
+# (index.html). This can be useful if you have a project on for instance GitHub
+# and want to reuse the introduction page also for the doxygen output.
+
+USE_MDFILE_AS_MAINPAGE =
+
+#---------------------------------------------------------------------------
+# Configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will be
+# generated. Documented entities will be cross-referenced with these sources.
+#
+# Note: To get rid of all source code in the generated output, make sure that
+# also VERBATIM_HEADERS is set to NO.
+# The default value is: NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body of functions,
+# classes and enums directly into the documentation.
+# The default value is: NO.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any
+# special comment blocks from generated source code fragments. Normal C, C++ and
+# Fortran comments will always remain visible.
+# The default value is: YES.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES then for each documented
+# entity all documented functions referencing it will be listed.
+# The default value is: NO.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES then for each documented function
+# all documented entities called/used by that function will be listed.
+# The default value is: NO.
+
+REFERENCES_RELATION    = YES
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set
+# to YES then the hyperlinks from functions in REFERENCES_RELATION and
+# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will
+# link to the documentation.
+# The default value is: YES.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the
+# source code will show a tooltip with additional information such as prototype,
+# brief description and links to the definition and documentation. Since this
+# will make the HTML file larger and loading of large files a bit slower, you
+# can opt to disable this feature.
+# The default value is: YES.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+SOURCE_TOOLTIPS        = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code will
+# point to the HTML generated by the htags(1) tool instead of doxygen built-in
+# source browser. The htags tool is part of GNU's global source tagging system
+# (see https://www.gnu.org/software/global/global.html). You will need version
+# 4.8.6 or higher.
+#
+# To use it do the following:
+# - Install the latest version of global
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
+# - Make sure the INPUT points to the root of the source tree
+# - Run doxygen as normal
+#
+# Doxygen will invoke htags (and that will in turn invoke gtags), so these
+# tools must be available from the command line (i.e. in the search path).
+#
+# The result: instead of the source browser generated by doxygen, the links to
+# source code will now point to the output of htags.
+# The default value is: NO.
+# This tag requires that the tag SOURCE_BROWSER is set to YES.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a
+# verbatim copy of the header file for each class for which an include is
+# specified. Set to NO to disable this.
+# See also: Section \class.
+# The default value is: YES.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all
+# compounds will be generated. Enable this if the project contains a lot of
+# classes, structs, unions or interfaces.
+# The default value is: YES.
+
+ALPHABETICAL_INDEX     = YES
+
+# In case all classes in a project start with a common prefix, all classes will
+# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
+# can be used to specify a prefix (or a list of prefixes) that should be ignored
+# while generating the index headers.
+# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output
+# The default value is: YES.
+
+GENERATE_HTML          = NO
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_OUTPUT            = build_docs/doxygen/html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each
+# generated HTML page (for example: .htm, .php, .asp).
+# The default value is: .html.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a user-defined HTML header file for
+# each generated HTML page. If the tag is left blank doxygen will generate a
+# standard header.
+#
+# To get valid HTML the header file that includes any scripts and style sheets
+# that doxygen needs, which is dependent on the configuration options used (e.g.
+# the setting GENERATE_TREEVIEW). It is highly recommended to start with a
+# default header using
+# doxygen -w html new_header.html new_footer.html new_stylesheet.css
+# YourConfigFile
+# and then modify the file new_header.html. See also section "Doxygen usage"
+# for information on how to generate the default header that doxygen normally
+# uses.
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. For a description
+# of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each
+# generated HTML page. If the tag is left blank doxygen will generate a standard
+# footer. See HTML_HEADER for more information on how to generate a default
+# footer and what special commands can be used inside the footer. See also
+# section "Doxygen usage" for information on how to generate the default footer
+# that doxygen normally uses.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style
+# sheet that is used by each HTML page. It can be used to fine-tune the look of
+# the HTML output. If left blank doxygen will generate a default style sheet.
+# See also section "Doxygen usage" for information on how to generate the style
+# sheet that doxygen normally uses.
+# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as
+# it is more robust and this tag (HTML_STYLESHEET) will in the future become
+# obsolete.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# cascading style sheets that are included after the standard style sheets
+# created by doxygen. Using this option one can overrule certain style aspects.
+# This is preferred over using HTML_STYLESHEET since it does not replace the
+# standard style sheet and is therefore more robust against future updates.
+# Doxygen will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list). For an example see the documentation.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_STYLESHEET  =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that the
+# files will be copied as-is; there are no commands or markers available.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
+# will adjust the colors in the style sheet and background images according to
+# this color. Hue is specified as an angle on a color-wheel, see
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
+# purple, and 360 is red again.
+# Minimum value: 0, maximum value: 359, default value: 220.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors
+# in the HTML output. For a value of 0 the output will use gray-scales only. A
+# value of 255 will produce the most vivid colors.
+# Minimum value: 0, maximum value: 255, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the
+# luminance component of the colors in the HTML output. Values below 100
+# gradually make the output lighter, whereas values above 100 make the output
+# darker. The value divided by 100 is the actual gamma applied, so 80 represents
+# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not
+# change the gamma.
+# Minimum value: 40, maximum value: 240, default value: 80.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting this
+# to YES can help to show when doxygen was last run and thus if the
+# documentation is up to date.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_TIMESTAMP         = NO
+
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consists of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries
+# shown in the various tree structured indices initially; the user can expand
+# and collapse entries dynamically later on. Doxygen will expand the tree to
+# such a level that at most the specified number of entries are visible (unless
+# a fully collapsed tree already exceeds this amount). So setting the number of
+# entries 1 will produce a full collapsed tree by default. 0 is a special value
+# representing an infinite number of entries and will result in a full expanded
+# tree by default.
+# Minimum value: 0, maximum value: 9999, default value: 100.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files will be
+# generated that can be used as input for Apple's Xcode 3 integrated development
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
+# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy
+# genXcode/_index.html for more information.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_DOCSET        = NO
+
+# This tag determines the name of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# The default value is: Doxygen generated docs.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# This tag determines the URL of the docset feed. A documentation feed provides
+# an umbrella under which multiple documentation sets from a single provider
+# (such as a company or product suite) can be grouped.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_FEEDURL         =
+
+# This tag specifies a string that should uniquely identify the documentation
+# set bundle. This should be a reverse domain-name style string, e.g.
+# com.mycompany.MyDocSet. Doxygen will append .docset to the name.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+# The default value is: org.doxygen.Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+# The default value is: Publisher.
+# This tag requires that the tag GENERATE_DOCSET is set to YES.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
+# additional HTML index files: index.hhp, index.hhc, and index.hhk. The
+# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
+# on Windows. In the beginning of 2021 Microsoft took the original page, with
+# a.o. the download links, offline the HTML help workshop was already many years
+# in maintenance mode). You can download the HTML help workshop from the web
+# archives at Installation executable (see:
+# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo
+# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe).
+#
+# The HTML Help Workshop contains a compiler that can convert all HTML output
+# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
+# files are now used as the Windows 98 help format, and will replace the old
+# Windows help format (.hlp) on all Windows platforms in the future. Compressed
+# HTML files also contain an index, a table of contents, and you can search for
+# words in the documentation. The HTML workshop also contains a viewer for
+# compressed HTML files.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_HTMLHELP      = NO
+
+# The CHM_FILE tag can be used to specify the file name of the resulting .chm
+# file. You can add a path in front of the file if the result should not be
+# written to the html output directory.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_FILE               =
+
+# The HHC_LOCATION tag can be used to specify the location (absolute path
+# including file name) of the HTML help compiler (hhc.exe). If non-empty,
+# doxygen will try to run the HTML help compiler on the generated index.hhp.
+# The file has to be specified with full path.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+HHC_LOCATION           =
+
+# The GENERATE_CHI flag controls if a separate .chi index file is generated
+# (YES) or that it should be included in the main .chm file (NO).
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+GENERATE_CHI           = NO
+
+# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc)
+# and project file content.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+CHM_INDEX_ENCODING     =
+
+# The BINARY_TOC flag controls whether a binary table of contents is generated
+# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it
+# enables the Previous and Next buttons.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members to
+# the table of contents of the HTML help documentation and to the tree view.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTMLHELP is set to YES.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that
+# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help
+# (.qch) of the generated HTML documentation.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify
+# the file name of the resulting .qch file. The path specified is relative to
+# the HTML output folder.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
+# Project output. For more information please see Qt Help Project / Namespace
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
+# Help Project output. For more information please see Qt Help Project / Virtual
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
+# The default value is: doc.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
+# filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see Qt Help Project / Custom
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. Qt Help Project / Filter Attributes (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
+# This tag requires that the tag GENERATE_QHP is set to YES.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be
+# generated, together with the HTML files, they form an Eclipse help plugin. To
+# install this plugin and make it available under the help contents menu in
+# Eclipse, the contents of the directory containing the HTML and XML files needs
+# to be copied into the plugins directory of eclipse. The name of the directory
+# within the plugins directory should be the same as the ECLIPSE_DOC_ID value.
+# After copying Eclipse needs to be restarted before the help appears.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the Eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have this
+# name. Each documentation set should have its own identifier.
+# The default value is: org.doxygen.Project.
+# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# If you want full control over the layout of the generated HTML pages it might
+# be necessary to disable the index and replace it with your own. The
+# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top
+# of each HTML page. A value of NO enables the index and the value YES disables
+# it. Since the tabs in the index contain the same information as the navigation
+# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+DISABLE_INDEX          = YES
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information. If the tag
+# value is set to YES, a side panel will be generated containing a tree-like
+# index structure (just like the one that is generated for HTML Help). For this
+# to work a browser that supports JavaScript, DHTML, CSS and frames is required
+# (i.e. any modern browser). Windows users are probably better off using the
+# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can
+# further fine tune the look of the index (see "Fine-tuning the output"). As an
+# example, the default style sheet generated by doxygen has an example that
+# shows how to put an image at the root of the tree instead of the PROJECT_NAME.
+# Since the tree basically has the same information as the tab index, you could
+# consider setting DISABLE_INDEX to YES when enabling this option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+GENERATE_TREEVIEW      = NO
+
+# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the
+# FULL_SIDEBAR option determines if the side bar is limited to only the treeview
+# area (value NO) or if it should extend to the full height of the window (value
+# YES). Setting this to YES gives a layout similar to
+# https://docs.readthedocs.io with more room for contents, but less room for the
+# project logo, title, and description. If either GENERATE_TREEVIEW or
+# DISABLE_INDEX is set to NO, this option has no effect.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FULL_SIDEBAR           = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that
+# doxygen will group on one line in the generated HTML documentation.
+#
+# Note that a value of 0 will completely suppress the enum values from appearing
+# in the overview section.
+# Minimum value: 0, maximum value: 20, default value: 4.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used
+# to set the initial width (in pixels) of the frame in which the tree is shown.
+# Minimum value: 0, maximum value: 1500, default value: 250.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+TREEVIEW_WIDTH         = 250
+
+# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to
+# external symbols imported via tag files in a separate window.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
+# Use this tag to change the font size of LaTeX formulas included as images in
+# the HTML documentation. When you change the font size after a successful
+# doxygen run you need to manually remove any form_*.png images from the HTML
+# output directory to force them to be regenerated.
+# Minimum value: 8, maximum value: 50, default value: 10.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are not
+# supported properly for IE 6.0, but are supported on all modern browsers.
+#
+# Note that when changing this option you need to delete any form_*.png files in
+# the HTML output directory before the changes have effect.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+FORMULA_TRANSPARENT    = YES
+
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
+# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
+# installed or if you want to formulas look prettier in the HTML output. When
+# enabled you may also need to install MathJax separately and configure the path
+# to it using the MATHJAX_RELPATH option.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+USE_MATHJAX            = NO
+
+# With MATHJAX_VERSION it is possible to specify the MathJax version to be used.
+# Note that the different versions of MathJax have different requirements with
+# regards to the different settings, so it is possible that also other MathJax
+# settings have to be changed when switching between the different MathJax
+# versions.
+# Possible values are: MathJax_2 and MathJax_3.
+# The default value is: MathJax_2.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_VERSION        = MathJax_2
+
+# When MathJax is enabled you can set the default output format to be used for
+# the MathJax output. For more details about the output format see MathJax
+# version 2 (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3
+# (see:
+# http://docs.mathjax.org/en/latest/web/components/output.html).
+# Possible values are: HTML-CSS (which is slower, but has the best
+# compatibility. This is the name for Mathjax version 2, for MathJax version 3
+# this will be translated into chtml), NativeMML (i.e. MathML. Only supported
+# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This
+# is the name for Mathjax version 3, for MathJax version 2 this will be
+# translated into HTML-CSS) and SVG.
+# The default value is: HTML-CSS.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_FORMAT         = HTML-CSS
+
+# When MathJax is enabled you need to specify the location relative to the HTML
+# output directory using the MATHJAX_RELPATH option. The destination directory
+# should contain the MathJax.js script. For instance, if the mathjax directory
+# is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
+# Content Delivery Network so you can quickly see the result without installing
+# MathJax. However, it is strongly recommended to install a local copy of
+# MathJax from https://www.mathjax.org before deployment. The default value is:
+# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2
+# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_RELPATH        = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax
+# extension names that should be enabled during MathJax rendering. For example
+# for MathJax version 2 (see https://docs.mathjax.org/en/v2.7-latest/tex.html
+# #tex-and-latex-extensions):
+# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols
+# For example for MathJax version 3 (see
+# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html):
+# MATHJAX_EXTENSIONS = ams
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_EXTENSIONS     =
+
+# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
+# of code that will be used on startup of the MathJax code. See the MathJax site
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
+# example see the documentation.
+# This tag requires that the tag USE_MATHJAX is set to YES.
+
+MATHJAX_CODEFILE       =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box for
+# the HTML output. The underlying search engine uses javascript and DHTML and
+# should work on any modern browser. Note that when using HTML help
+# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET)
+# there is already a search function so this one should typically be disabled.
+# For large projects the javascript based search engine can be slow, then
+# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to
+# search using the keyboard; to jump to the search box use <access key> + S
+# (what the <access key> is depends on the OS and browser, but it is typically
+# <CTRL>, <ALT>/<option>, or both). Inside the search box use the <cursor down
+# key> to jump into the search results window, the results can be navigated
+# using the <cursor keys>. Press <Enter> to select an item or <escape> to cancel
+# the search. The filter options can be selected when the cursor is inside the
+# search box by pressing <Shift>+<cursor down>. Also here use the <cursor keys>
+# to select a filter and <Enter> or <escape> to activate or cancel the filter
+# option.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+SEARCHENGINE           = NO
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a web server instead of a web client using JavaScript. There
+# are two flavors of web server based searching depending on the EXTERNAL_SEARCH
+# setting. When disabled, doxygen will generate a PHP script for searching and
+# an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
+# and searching needs to be provided by external tools. See the section
+# "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SERVER_BASED_SEARCH    = NO
+
+# When EXTERNAL_SEARCH tag is enabled doxygen will no longer generate the PHP
+# script for searching. Instead the search results are written to an XML file
+# which needs to be processed by an external indexer. Doxygen will invoke an
+# external search engine pointed to by the SEARCHENGINE_URL option to obtain the
+# search results.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/).
+#
+# See the section "External Indexing and Searching" for details.
+# The default value is: NO.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH        = NO
+
+# The SEARCHENGINE_URL should point to a search engine hosted by a web server
+# which will return the search results when EXTERNAL_SEARCH is enabled.
+#
+# Doxygen ships with an example indexer (doxyindexer) and search engine
+# (doxysearch.cgi) which are based on the open source search engine library
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHENGINE_URL       =
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed
+# search data is written to a file for indexing by an external tool. With the
+# SEARCHDATA_FILE tag the name of this file can be specified.
+# The default file is: searchdata.xml.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+SEARCHDATA_FILE        = searchdata.xml
+
+# When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the
+# EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is
+# useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple
+# projects and redirect the results back to the right project.
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTERNAL_SEARCH_ID     =
+
+# The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen
+# projects other than the one defined by this configuration file, but that are
+# all added to the same external search index. Each project needs to have a
+# unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id of
+# to a relative location where the documentation can be found. The format is:
+# EXTRA_SEARCH_MAPPINGS = tagname1=loc1 tagname2=loc2 ...
+# This tag requires that the tag SEARCHENGINE is set to YES.
+
+EXTRA_SEARCH_MAPPINGS  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output.
+# The default value is: YES.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: latex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked.
+#
+# Note that when not enabling USE_PDFLATEX the default is latex when enabling
+# USE_PDFLATEX the default is pdflatex and when in the later case latex is
+# chosen this is overwritten by pdflatex. For specific output languages the
+# default can have been set differently, this depends on the implementation of
+# the output language.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_CMD_NAME         =
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
+# index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
+# The default file is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
+# If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used by the
+# printer.
+# Possible values are: a4 (210 x 297 mm), letter (8.5 x 11 inches), legal (8.5 x
+# 14 inches) and executive (7.25 x 10.5 inches).
+# The default value is: a4.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more LaTeX package names
+# that should be included in the LaTeX output. The package can be specified just
+# by its name or with the correct syntax as to be used with the LaTeX
+# \usepackage command. To get the times font for instance you can specify :
+# EXTRA_PACKAGES=times or EXTRA_PACKAGES={times}
+# To use the option intlimits with the amsmath package you can specify:
+# EXTRA_PACKAGES=[intlimits]{amsmath}
+# If left blank no extra packages will be included.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for
+# the generated LaTeX document. The header should contain everything until the
+# first chapter. If it is left blank doxygen will generate a standard header. It
+# is highly recommended to start with a default header using
+# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty
+# and then modify the file new_header.tex. See also section "Doxygen usage" for
+# information on how to generate the default header that doxygen normally uses.
+#
+# Note: Only use a user-defined header if you know what you are doing!
+# Note: The header is subject to change so you typically have to regenerate the
+# default header when upgrading to a newer version of doxygen. The following
+# commands have a special meaning inside the header (and footer): For a
+# description of the possible markers and block names see the documentation.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for
+# the generated LaTeX document. The footer should contain everything after the
+# last chapter. If it is left blank doxygen will generate a standard footer. See
+# LATEX_HEADER for more information on how to generate a default footer and what
+# special commands can be used inside the footer. See also section "Doxygen
+# usage" for information on how to generate the default footer that doxygen
+# normally uses. Note: Only use a user-defined footer if you know what you are
+# doing!
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_FOOTER           =
+
+# The LATEX_EXTRA_STYLESHEET tag can be used to specify additional user-defined
+# LaTeX style sheets that are included after the standard style sheets created
+# by doxygen. Using this option one can overrule certain style aspects. Doxygen
+# will copy the style sheet files to the output directory.
+# Note: The order of the extra style sheet files is of importance (e.g. the last
+# style sheet in the list overrules the setting of the previous ones in the
+# list).
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_STYLESHEET =
+
+# The LATEX_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the LATEX_OUTPUT output
+# directory. Note that the files will be copied as-is; there are no commands or
+# markers available.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EXTRA_FILES      =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated is
+# prepared for conversion to PDF (using ps2pdf or pdflatex). The PDF file will
+# contain links (just like the HTML output) instead of page references. This
+# makes the output suitable for online browsing using a PDF viewer.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep running
+# if errors occur, instead of asking the user for help.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BATCHMODE        = NO
+
+# If the LATEX_HIDE_INDICES tag is set to YES then doxygen will not include the
+# index chapters (such as File Index, Compound Index, etc.) in the output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_HIDE_INDICES     = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. See
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# The default value is: plain.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_BIB_STYLE        = plain
+
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES, doxygen will generate RTF output. The
+# RTF output is optimized for Word 97 and may not look too pretty with other RTF
+# readers/editors.
+# The default value is: NO.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: rtf.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES, doxygen generates more compact RTF
+# documents. This may be useful for small projects and may help to save some
+# trees in general.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated will
+# contain hyperlink fields. The RTF file will contain links (just like the HTML
+# output) instead of page references. This makes the output suitable for online
+# browsing using Word or some other Word compatible readers that support those
+# fields.
+#
+# Note: WordPad (write) and others do not support links.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_HYPERLINKS         = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+#
+# See also section "Doxygen usage" for information on how to generate the
+# default style sheet that doxygen normally uses.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an RTF document. Syntax is
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
+# This tag requires that the tag GENERATE_RTF is set to YES.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES, doxygen will generate man pages for
+# classes and files.
+# The default value is: NO.
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it. A directory man3 will be created inside the directory specified by
+# MAN_OUTPUT.
+# The default directory is: man.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to the generated
+# man pages. In case the manual section does not start with a number, the number
+# 3 is prepended. The dot (.) at the beginning of the MAN_EXTENSION tag is
+# optional.
+# The default value is: .3.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_EXTENSION          = .3
+
+# The MAN_SUBDIR tag determines the name of the directory created within
+# MAN_OUTPUT in which the man pages are placed. If defaults to man followed by
+# MAN_EXTENSION with the initial . removed.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_SUBDIR             =
+
+# If the MAN_LINKS tag is set to YES and doxygen generates man output, then it
+# will generate one additional man file for each entity documented in the real
+# man page(s). These additional files only source the real man page, but without
+# them the man command would be unable to find the correct page.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_MAN is set to YES.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES, doxygen will generate an XML file that
+# captures the structure of the code including all documentation.
+# The default value is: NO.
+
+GENERATE_XML           = YES
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put. If a
+# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
+# it.
+# The default directory is: xml.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_OUTPUT             = build_docs/doxygen/xml
+
+# If the XML_PROGRAMLISTING tag is set to YES, doxygen will dump the program
+# listings (including syntax highlighting and cross-referencing information) to
+# the XML output. Note that enabling this will significantly increase the size
+# of the XML output.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_PROGRAMLISTING     = YES
+
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the DOCBOOK output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_DOCBOOK tag is set to YES, doxygen will generate Docbook files
+# that can be used to generate PDF.
+# The default value is: NO.
+
+GENERATE_DOCBOOK       = NO
+
+# The DOCBOOK_OUTPUT tag is used to specify where the Docbook pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be put in
+# front of it.
+# The default directory is: docbook.
+# This tag requires that the tag GENERATE_DOCBOOK is set to YES.
+
+DOCBOOK_OUTPUT         = docbook
+
+#---------------------------------------------------------------------------
+# Configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES, doxygen will generate a Perl module
+# file that captures the structure of the code including all documentation.
+#
+# Note that this feature is still experimental and incomplete at the moment.
+# The default value is: NO.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES, doxygen will generate the necessary
+# Makefile rules, Perl scripts and LaTeX code to be able to generate PDF and DVI
+# output from the Perl module output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES, the Perl module output will be nicely
+# formatted so it can be parsed by a human reader. This is useful if you want to
+# understand what is going on. On the other hand, if this tag is set to NO, the
+# size of the Perl module output will be much smaller and Perl will parse it
+# just the same.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file are
+# prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. This is useful
+# so different doxyrules.make files included by the same Makefile don't
+# overwrite each other's variables.
+# This tag requires that the tag GENERATE_PERLMOD is set to YES.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES, doxygen will evaluate all
+# C-preprocessor directives found in the sources and include files.
+# The default value is: YES.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES, doxygen will expand all macro names
+# in the source code. If set to NO, only conditional compilation will be
+# performed. Macro expansion can be done in a controlled way by setting
+# EXPAND_ONLY_PREDEF to YES.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+MACRO_EXPANSION        = YES
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then
+# the macro expansion is limited to the macros specified with the PREDEFINED and
+# EXPAND_AS_DEFINED tags.
+# The default value is: NO.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES, the include files in the
+# INCLUDE_PATH will be searched if a #include is found.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SEARCH_INCLUDES        = NO
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by the
+# preprocessor.
+# This tag requires that the tag SEARCH_INCLUDES is set to YES.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will be
+# used.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that are
+# defined before the preprocessor is started (similar to the -D option of e.g.
+# gcc). The argument of the tag is a list of macros of the form: name or
+# name=definition (no spaces). If the definition and the "=" are omitted, "=1"
+# is assumed. To prevent a macro definition from being undefined via #undef or
+# recursively expanded use the := operator instead of the = operator.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+PREDEFINED             = THRUST_DOXYGEN \
+                         THRUST_CPP_DIALECT=2017 \
+                         THRUST_NODISCARD=[[nodiscard]] \
+                         THRUST_MR_DEFAULT_ALIGNMENT=alignof(std::max_align_t) \
+                         "THRUST_NAMESPACE_BEGIN=namespace thrust {" \
+                         THRUST_NAMESPACE_END=}
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
+# tag can be used to specify a list of macro names that should be expanded. The
+# macro definition that is found in the sources will be used. Use the PREDEFINED
+# tag if you want to use a different macro definition that overrules the
+# definition found in the source code.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES then doxygen's preprocessor will
+# remove all references to function-like macros that are alone on a line, have
+# an all uppercase name, and do not end with a semicolon. Such function macros
+# are typically used for boiler-plate code, and will confuse the parser if not
+# removed.
+# The default value is: YES.
+# This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration options related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tag files. For each tag
+# file the location of the external documentation should be added. The format of
+# a tag file without this location is as follows:
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where loc1 and loc2 can be relative or absolute paths or URLs. See the
+# section "Linking to external documentation" for more information about the use
+# of tag files.
+# Note: Each tag file must have a unique name (where the name does NOT include
+# the path). If a tag file is not located in the directory in which doxygen is
+# run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create a
+# tag file that is based on the input files it reads. See section "Linking to
+# external documentation" for more information about the usage of tag files.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES, all external class will be listed in
+# the class index. If set to NO, only the inherited external classes will be
+# listed.
+# The default value is: NO.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will be
+# listed.
+# The default value is: YES.
+
+EXTERNAL_GROUPS        = YES
+
+# If the EXTERNAL_PAGES tag is set to YES, all external pages will be listed in
+# the related pages index. If set to NO, only the current project's pages will
+# be listed.
+# The default value is: YES.
+
+EXTERNAL_PAGES         = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# You can include diagrams made with dia in doxygen documentation. Doxygen will
+# then run dia to produce the diagram and insert it in the documentation. The
+# DIA_PATH tag allows you to specify the directory where the dia binary resides.
+# If left empty dia is assumed to be found in the default search path.
+
+DIA_PATH               =
+
+# If set to YES the inheritance and collaboration graphs will hide inheritance
+# and usage relations if the target is undocumented or is not a class.
+# The default value is: YES.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz (see:
+# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent
+# Bell Labs. The other options in this section have no effect if this option is
+# set to NO
+# The default value is: NO.
+
+HAVE_DOT               = NO
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is allowed
+# to run in parallel. When set to 0 doxygen will base this on the number of
+# processors available in the system. You can set it explicitly to a value
+# larger than 0 to get control over the balance between CPU load and processing
+# speed.
+# Minimum value: 0, maximum value: 32, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_NUM_THREADS        = 0
+
+# When you want a differently looking font in the dot files that doxygen
+# generates you can specify the font name using DOT_FONTNAME. You need to make
+# sure dot is able to find the font, which can be done by putting it in a
+# standard location or by setting the DOTFONTPATH environment variable or by
+# setting DOT_FONTPATH to the directory containing the font.
+# The default value is: Helvetica.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of
+# dot graphs.
+# Minimum value: 4, maximum value: 24, default value: 10.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the default font as specified with
+# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set
+# the path where dot can find it using this tag.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
+
+CLASS_GRAPH            = NO
+
+# If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a
+# graph for each documented class showing the direct and indirect implementation
+# dependencies (inheritance, containment, and class references variables) of the
+# class with other documented classes.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+COLLABORATION_GRAPH    = NO
+
+# If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for
+# groups, showing the direct groups dependencies.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GROUP_GRAPHS           = NO
+
+# If the UML_LOOK tag is set to YES, doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+UML_LOOK               = NO
+
+# If the UML_LOOK tag is enabled, the fields and methods are shown inside the
+# class node. If there are many fields or methods and many nodes the graph may
+# become too big to be useful. The UML_LIMIT_NUM_FIELDS threshold limits the
+# number of items for each type to make the size more manageable. Set this to 0
+# for no limit. Note that the threshold may be exceeded by 50% before the limit
+# is enforced. So when you set the threshold to 10, up to 15 fields may appear,
+# but if the number exceeds 15, the total amount of fields shown is limited to
+# 10.
+# Minimum value: 0, maximum value: 100, default value: 10.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+UML_LIMIT_NUM_FIELDS   = 10
+
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will wrapped across multiple lines. Some heuristics are apply
+# to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
+# If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
+# collaboration graphs will show the relations between templates and their
+# instances.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to
+# YES then doxygen will generate a graph for each documented file showing the
+# direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDE_GRAPH          = NO
+
+# If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are
+# set to YES then doxygen will generate a graph for each documented file showing
+# the direct and indirect include dependencies of the file with other documented
+# files.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INCLUDED_BY_GRAPH      = NO
+
+# If the CALL_GRAPH tag is set to YES then doxygen will generate a call
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable call graphs for selected
+# functions only using the \callgraph command. Disabling a call graph can be
+# accomplished by means of the command \hidecallgraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH tag is set to YES then doxygen will generate a caller
+# dependency graph for every global function or class method.
+#
+# Note that enabling this option will significantly increase the time of a run.
+# So in most cases it will be better to enable caller graphs for selected
+# functions only using the \callergraph command. Disabling a caller graph can be
+# accomplished by means of the command \hidecallergraph.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY tag is set to YES then doxygen will graphical
+# hierarchy of all classes instead of a textual one.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GRAPHICAL_HIERARCHY    = NO
+
+# If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the
+# dependencies a directory has on other directories in a graphical way. The
+# dependency relations are determined by the #include relations between the
+# files in the directories.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DIRECTORY_GRAPH        = NO
+
+# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels
+# of child directories generated in directory dependency graphs by dot.
+# Minimum value: 1, maximum value: 25, default value: 1.
+# This tag requires that the tag DIRECTORY_GRAPH is set to YES.
+
+DIR_GRAPH_MAX_DEPTH    = 1
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. For an explanation of the image formats see the section
+# output formats in the documentation of the dot tool (Graphviz (see:
+# http://www.graphviz.org/)).
+# Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order
+# to make the SVG files visible in IE 9+ (other browsers do not have this
+# requirement).
+# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo,
+# png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and
+# png:gdiplus:gdiplus.
+# The default value is: png.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+#
+# Note that this requires a modern browser other than Internet Explorer. Tested
+# and working are Firefox, Chrome, Safari, and Opera.
+# Note: For IE 9+ you need to set HTML_FILE_EXTENSION to xhtml in order to make
+# the SVG files visible. Older versions of IE do not have SVG support.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+INTERACTIVE_SVG        = NO
+
+# The DOT_PATH tag can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the \dotfile
+# command).
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the \mscfile
+# command).
+
+MSCFILE_DIRS           =
+
+# The DIAFILE_DIRS tag can be used to specify one or more directories that
+# contain dia files that are included in the documentation (see the \diafile
+# command).
+
+DIAFILE_DIRS           =
+
+# When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the
+# path where java can find the plantuml.jar file or to the filename of jar file
+# to be used. If left blank, it is assumed PlantUML is not used or called during
+# a preprocessing step. Doxygen will generate a warning when it encounters a
+# \startuml command in this case and will not generate output for the diagram.
+
+PLANTUML_JAR_PATH      =
+
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
+# When using plantuml, the specified paths are searched for files specified by
+# the !include statement in a plantuml block.
+
+PLANTUML_INCLUDE_PATH  =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of nodes
+# that will be shown in the graph. If the number of nodes in a graph becomes
+# larger than this value, doxygen will truncate the graph, which is visualized
+# by representing a node as a red box. Note that doxygen if the number of direct
+# children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note that
+# the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+# Minimum value: 0, maximum value: 10000, default value: 50.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the graphs
+# generated by dot. A depth value of 3 means that only nodes reachable from the
+# root by following a path via at most 3 edges will be shown. Nodes that lay
+# further from the root node will be omitted. Note that setting this option to 1
+# or 2 may greatly reduce the computation time needed for large code bases. Also
+# note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+# Minimum value: 0, maximum value: 1000, default value: 0.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not seem
+# to support this out of the box.
+#
+# Warning: Depending on the platform used, enabling this option may lead to
+# badly anti-aliased labels on the edges of a graph (i.e. they become hard to
+# read).
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10) support
+# this, this feature is disabled by default.
+# The default value is: NO.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_MULTI_TARGETS      = NO
+
+# If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page
+# explaining the meaning of the various boxes and arrows in the dot generated
+# graphs.
+# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal
+# graphical representation for inheritance and collaboration diagrams is used.
+# The default value is: YES.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
+# files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc temporary
+# files.
+# The default value is: YES.
+
+DOT_CLEANUP            = YES
diff --git a/docs/generate_markdown.bash b/docs/generate_markdown.bash
new file mode 100755
index 000000000..3b711db10
--- /dev/null
+++ b/docs/generate_markdown.bash
@@ -0,0 +1,106 @@
+#! /usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2018-2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+set -e
+
+function usage {
+  echo "Usage: ${0} [flags...]"
+  echo
+  echo "Generate Thrust documentation markdown with Doxygen and Doxybook that "
+  echo "can be served with Jekyll."
+  echo
+  echo "-h, -help, --help"
+  echo "  Print this message."
+  echo
+  echo "-c, --clean"
+  echo "  Delete the all existing build artifacts before generating the "
+  echo "  markdown."
+
+  exit -3
+}
+
+LOCAL=0
+CLEAN=0
+
+while test ${#} != 0
+do
+  case "${1}" in
+  -h) ;&
+  -help) ;&
+  --help) usage ;;
+  -c) ;&
+  --clean) CLEAN=1 ;;
+  esac
+  shift
+done
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPO_PATH=${SCRIPT_PATH}/..
+
+BUILD_DOCS_PATH=build_docs
+BUILD_DOXYGEN_PATH=${BUILD_DOCS_PATH}/doxygen
+BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages
+
+cd ${REPO_PATH}
+
+if [[ "${CLEAN}" == 1 ]]; then
+  rm -rf ${BUILD_DOXYGEN_PATH}
+  rm -rf ${BUILD_GITHUB_PAGES_PATH}
+fi
+
+mkdir -p ${BUILD_DOXYGEN_PATH}/xml
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/api
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/contributing
+mkdir -p ${BUILD_GITHUB_PAGES_PATH}/releases
+
+# Copy all the documentation sources and Jekyll configuration into
+# `{BUILD_GITHUB_PAGES_PATH}`.
+cp -ur docs/github_pages/* ${BUILD_GITHUB_PAGES_PATH}/
+cp README.md               ${BUILD_GITHUB_PAGES_PATH}/overview.md
+cp CODE_OF_CONDUCT.md      ${BUILD_GITHUB_PAGES_PATH}/contributing/code_of_conduct.md
+cp CHANGELOG.md            ${BUILD_GITHUB_PAGES_PATH}/releases/changelog.md
+
+doxygen docs/doxygen/config.dox
+
+# `--debug-templates` will cause JSON output to be generated, which is useful
+# for debugging.
+doxybook2 --config docs/doxybook/config.json  \
+          --templates docs/doxybook/templates \
+          --debug-templates                   \
+          --input ${BUILD_DOXYGEN_PATH}/xml   \
+          --output ${BUILD_GITHUB_PAGES_PATH}/api
+
+# Doxygen and Doxybook don't give us a way to disable all the things we'd like,
+# so it's important to purge Doxybook Markdown output that we don't need:
+# 0) We want our Jekyll build to be as fast as possible and avoid wasting time
+#    on stuff we don't need.
+# 1) We don't want content that we don't plan to use to either show up on the
+#    site index or appear in search results.
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/files
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_files.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/pages
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_pages.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/examples
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_examples.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/images
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_namespaces.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_groups.md
+rm -rf ${BUILD_GITHUB_PAGES_PATH}/api/index_classes.md
+
diff --git a/docs/github_pages/Gemfile b/docs/github_pages/Gemfile
new file mode 100644
index 000000000..09d948e17
--- /dev/null
+++ b/docs/github_pages/Gemfile
@@ -0,0 +1,10 @@
+source "https://rubygems.org"
+gem "just-the-docs"
+group :jekyll_plugins do
+  gem "github-pages"                 # GitHub Pages.
+  gem "jekyll-optional-front-matter" # GitHub Pages.
+  gem "jekyll-default-layout"        # GitHub Pages.
+  gem "jekyll-titles-from-headings"  # GitHub Pages.
+  gem "jekyll-relative-links"        # GitHub Pages.
+  gem "jekyll-include-cache"
+end
diff --git a/docs/github_pages/_config.yml b/docs/github_pages/_config.yml
new file mode 100644
index 000000000..c131e84fb
--- /dev/null
+++ b/docs/github_pages/_config.yml
@@ -0,0 +1,47 @@
+title: Thrust
+
+repository: nvidia/thrust
+
+remote_theme: pmarsceill/just-the-docs
+
+color_scheme: nvidia
+logo: /assets/images/nvidia_logo.png
+
+search_enabled: true
+search.heading_level: 4
+
+incremental: true
+
+# just-the-docs ignores these filenames by default.
+include: [ "contributing.md", "code_of_conduct.md" ]
+
+exclude: [ "node_modules", "doxybook_templates",
+           "generate_markdown.bash", "serve_docs_locally.bash" ]
+
+plugins:
+  - jekyll-optional-front-matter # GitHub Pages.
+  - jekyll-default-layout        # GitHub Pages.
+  - jekyll-titles-from-headings  # GitHub Pages.
+  - jekyll-relative-links        # GitHub Pages.
+  - jekyll-include-cache
+
+defaults:
+  -
+    scope:
+      path: overview.md
+    values:
+      title: Overview
+      nav_order: 0
+      permalink: /
+  -
+    scope:
+      path: contributing/code_of_conduct.md
+    values:
+      parent: Contributing
+      nav_order: 2
+  -
+    scope:
+      path: releases/changelog.md
+    values:
+      parent: Releases
+      nav_order: 0
diff --git a/docs/github_pages/_sass/color_schemes/nvidia.scss b/docs/github_pages/_sass/color_schemes/nvidia.scss
new file mode 100644
index 000000000..4b44fa222
--- /dev/null
+++ b/docs/github_pages/_sass/color_schemes/nvidia.scss
@@ -0,0 +1,145 @@
+$body-line-height: 1.4;
+$content-line-height: 1.4;
+.highlight { line-height: 1.0 !important; }
+
+/* h1 size. We make this smaller so the README title fits on one line. */
+$font-size-9: 30px;
+
+/* Inline code. */
+code,
+code.highlighter-rouge
+{ font-size: 0.85em !important; }
+
+/* Code blocks. */
+pre.highlight code { font-size: 0.9em !important; }
+
+/* Doxybook generated code snippets. */
+code.doxybook { display: block; }
+
+/* Line wrap with an indent of four characters in Doxybook-generated code snippets. */
+code.doxybook span
+{ display: block; text-indent: -4ex !important; padding-left: 4ex !important; }
+
+/* Line wrap with an indent of eight characters in Doxybook-generated code snippets. */
+code.doxybook span
+{ display: block; text-indent: -8ex !important; padding-left: 8ex !important; }
+
+/* Disable line wrap for indent <span>s. */
+code.doxybook
+{ display: block; text-indent: 0ex !important; padding-left: 0ex !important; }
+
+h3 { margin-bottom: 1.0em !important; }
+
+$nav-width: 300px;
+
+$body-background-color: $grey-dk-300;
+$sidebar-color: $grey-dk-300;
+$border-color: $grey-dk-200;
+
+$body-text-color: $grey-lt-300;
+$body-heading-color: $grey-lt-000;
+$nav-child-link-color: $grey-dk-000;
+$search-result-preview-color: $grey-dk-000;
+
+$link-color: #76b900;
+$btn-primary-color: #76b900;
+$base-button-color: $grey-dk-250;
+
+$code-background-color: $grey-dk-250;
+$search-background-color: $grey-dk-250;
+$table-background-color: $grey-dk-250;
+$feedback-color: darken($sidebar-color, 3%);
+
+div.highlighter-rouge,
+pre.highlight code,
+code.doxybook
+{ background-color: #111 !important; }
+
+span.doxybook-comment code
+{ background-color: #111 !important; border: none !important; }
+
+.highlight span.err { color: #ff0000; font-weight: bold; } /* Error */
+
+.highlight span.ow, /* Operator.Word */
+.highlight span.k,  /* Keyword */
+.highlight span.kc, /* Keyword.Constant */
+.highlight span.kd, /* Keyword.Declaration */
+.highlight span.kp, /* Keyword.Pseudo */
+.highlight span.kr, /* Keyword.Reserved */
+.highlight span.bp, /* Name.Builtin.Pseudo */
+.highlight span.vc, /* Name.Variable.Class */
+.highlight span.vg, /* Name.Variable.Global */
+.highlight span.vi  /* Name.Variable.Instance */
+{ color: #76b900; font-weight: bold; }
+
+.highlight span.n,  /* Name */
+.highlight span.h,  /* Name */
+.highlight span.na, /* Name.Attribute */
+.highlight span.nb, /* Name.Builtin */
+.highlight span.nc, /* Name.Class */
+.highlight span.no, /* Name.Constant */
+.highlight span.nd, /* Name.Decorator */
+.highlight span.ni, /* Name.Entity */
+.highlight span.ne, /* Name.Exception */
+.highlight span.nf, /* Name.Function */
+.highlight span.nl, /* Name.Label */
+.highlight span.nn, /* Name.Namespace */
+.highlight span.nx, /* Name.Other */
+.highlight span.py, /* Name.Property */
+.highlight span.nt, /* Name.Tag */
+.highlight span.nv, /* Name.Variable */
+.highlight span.kt  /* Keyword.Type */
+{ color: $grey-lt-300 }
+
+.highlight span.c,  /* Comment */
+.highlight span.cm, /* Comment.Multiline */
+.highlight span.c1, /* Comment.Single */
+.highlight span.cs, /* Comment.Special */
+span.doxybook-comment
+{ color: #009966; font-family: $body-font-family; font-style: italic; }
+
+.highlight span.cp  /* Preprocessor */
+.highlight span.kn, /* Keyword.Namespace */
+{ color: $grey-dk-000 }
+
+.highlight span.o, /* Operator */
+.highlight span.p  /* Punctuation */
+{ color: #00ff00; }
+
+.highlight span.ge { font-style: italic; } /* Generic.Emph */
+
+.highlight span.gs { font-weight: bold; } /* Generic.Strong */
+
+.highlight span.l,  /* Literal */
+.highlight span.ld, /* Literal.Date */
+.highlight span.m,  /* Literal.Number */
+.highlight span.mf, /* Literal.Number.Float */
+.highlight span.mh, /* Literal.Number.Hex */
+.highlight span.mi, /* Literal.Number.Integer */
+.highlight span.mo, /* Literal.Number.Oct */
+.highlight span.il, /* Literal.Number.Integer.Long */
+.highlight span.s,  /* Literal.String */
+.highlight span.sb, /* Literal.String.Backtick */
+.highlight span.sc, /* Literal.String.Char */
+.highlight span.sd, /* Literal.String.Doc */
+.highlight span.s2, /* Literal.String.Double */
+.highlight span.se, /* Literal.String.Escape */
+.highlight span.sh, /* Literal.String.Heredoc */
+.highlight span.si, /* Literal.String.Interpol */
+.highlight span.sx, /* Literal.String.Other */
+.highlight span.sr, /* Literal.String.Regex */
+.highlight span.s1, /* Literal.String.Single */
+.highlight span.ss  /* Literal.String.Symbol */
+{ color: #119911; }
+
+.highlight span.w { color: #00cc00; } /* Text.Whitespace */
+
+.highlight span.gh, /* Generic.Heading */
+.highlight span.gp, /* Generic.Prompt */
+.highlight span.gu  /* Generic.Subheading */
+{ color: #00ff00; font-weight: bold; }
+
+.highlight span.gd { color: #ff0000; } /* Generic.Deleted */
+.highlight span.gi { color: #00ff00; } /* Generic.Inserted */
+
+.search-input { color: $body-text-color; }
diff --git a/docs/github_pages/api.md b/docs/github_pages/api.md
new file mode 100644
index 000000000..6a2d1af43
--- /dev/null
+++ b/docs/github_pages/api.md
@@ -0,0 +1,8 @@
+---
+has_children: true
+has_toc: true
+nav_order: 2
+---
+
+# API
+
diff --git a/docs/github_pages/assets/images/nvidia_logo.png b/docs/github_pages/assets/images/nvidia_logo.png
new file mode 100644
index 000000000..6b005a283
Binary files /dev/null and b/docs/github_pages/assets/images/nvidia_logo.png differ
diff --git a/docs/github_pages/contributing.md b/docs/github_pages/contributing.md
new file mode 100644
index 000000000..6539768c4
--- /dev/null
+++ b/docs/github_pages/contributing.md
@@ -0,0 +1,10 @@
+---
+has_children: true
+has_toc: true
+nav_order: 4
+---
+
+# Contributing
+
+We welcome contributions - just send us a pull request!
+
diff --git a/docs/github_pages/contributing/release_process.md b/docs/github_pages/contributing/release_process.md
new file mode 100644
index 000000000..db21f60b4
--- /dev/null
+++ b/docs/github_pages/contributing/release_process.md
@@ -0,0 +1,85 @@
+---
+parent: Contributing
+nav_order: 1
+---
+
+# Release Process
+
+## Create a Changelog Entry
+
+Every release must have a changelog entry.
+The changelog entry should include:
+* A summary of the major accomplishments of the release.
+* A list of all the changes in the release.
+* A list of all the bugs fixed by the release.
+
+Contributions from new collaborators should be acknowledged in the changelog.
+
+## Create Git Annotated Tags and GitHub Releases
+
+Each release needs to have a Git annotated tag and a GitHub release for that tag.
+The changelog for the release should be used for the text of the GitHub release.
+
+## Update Compiler Explorer
+
+Thrust and CUB are bundled together on
+[Compiler Explorer](https://www.godbolt.org/) (CE) as libraries for the CUDA
+language. When releasing a new version of these projects, CE will need to be
+updated.
+
+There are two files in two repos that need to be updated:
+
+### libraries.yaml
+
+- Repo: https://github.com/compiler-explorer/infra
+- Path: bin/yaml/libraries.yaml
+
+This file tells CE how to pull in library files and defines which versions to
+fetch. Look for the `thrustcub:` section:
+
+```yaml
+    thrustcub:
+      type: github
+      method: clone_branch
+      repo: NVIDIA/thrust
+      check_file: dependencies/cub/cub/cub.cuh
+      targets:
+        - 1.9.9
+        - 1.9.10
+        - 1.9.10-1
+        - 1.10.0
+```
+
+Simply add the new version tag to list of `targets:`. This will check out the
+specified tag to `/opt/compiler-explorer/libs/thrustcub/<tag>/`.
+
+### cuda.amazon.properties
+
+- Repo: https://github.com/compiler-explorer/compiler-explorer
+- File: etc/config/cuda.amazon.properties
+
+This file defines the library versions displayed in the CE UI and maps them
+to a set of include directories. Look for the `libs.thrustcub` section:
+
+```yaml
+libs.thrustcub.name=Thrust+CUB
+libs.thrustcub.description=CUDA collective and parallel algorithms
+libs.thrustcub.versions=trunk:109090:109100:109101:110000
+libs.thrustcub.url=http://www.github.com/NVIDIA/thrust
+libs.thrustcub.versions.109090.version=1.9.9
+libs.thrustcub.versions.109090.path=/opt/compiler-explorer/libs/thrustcub/1.9.9:/opt/compiler-explorer/libs/thrustcub/1.9.9/dependencies/cub
+libs.thrustcub.versions.109100.version=1.9.10
+libs.thrustcub.versions.109100.path=/opt/compiler-explorer/libs/thrustcub/1.9.10:/opt/compiler-explorer/libs/thrustcub/1.9.10/dependencies/cub
+libs.thrustcub.versions.109101.version=1.9.10-1
+libs.thrustcub.versions.109101.path=/opt/compiler-explorer/libs/thrustcub/1.9.10-1:/opt/compiler-explorer/libs/thrustcub/1.9.10-1/dependencies/cub
+libs.thrustcub.versions.110000.version=1.10.0
+libs.thrustcub.versions.110000.path=/opt/compiler-explorer/libs/thrustcub/1.10.0:/opt/compiler-explorer/libs/thrustcub/1.10.0/dependencies/cub
+libs.thrustcub.versions.trunk.version=trunk
+libs.thrustcub.versions.trunk.path=/opt/compiler-explorer/libs/thrustcub/trunk:/opt/compiler-explorer/libs/thrustcub/trunk/dependencies/cub
+```
+
+Add a new version identifier to the `libs.thrustcub.versions` key, using the
+convention `X.Y.Z-W -> XXYYZZWW`. Then add a corresponding UI label (the
+`version` key) and set of colon-separated include paths for Thrust and CUB
+(`path`). The version used in the `path` entries must exactly match the tag
+specified in `libraries.yaml`.
diff --git a/docs/github_pages/contributing/submitting_a_pr.md b/docs/github_pages/contributing/submitting_a_pr.md
new file mode 100644
index 000000000..9c1757655
--- /dev/null
+++ b/docs/github_pages/contributing/submitting_a_pr.md
@@ -0,0 +1,295 @@
+---
+parent: Contributing
+nav_order: 0
+---
+
+# Submitting a PR
+
+Thrust uses Github to manage all open-source development, including bug
+tracking, pull requests, and design discussions. This document details how to get
+started as a Thrust contributor.
+
+An overview of this process is:
+
+1. [Clone the Thrust repository](#clone-the-thrust-repository)
+1. [Setup a fork of Thrust](#setup-a-fork-of-thrust)
+1. [Setup your environment](#setup-your-environment)
+1. [Create a development branch](#create-a-development-branch)
+1. [Local development loop](#local-development-loop)
+1. [Push development branch to your fork](#push-development-branch-to-your-fork)
+1. [Create pull request](#create-pull-request)
+1. [Address feedback and update pull request](#address-feedback-and-update-pull-request)
+1. [When your PR is approved...](#when-your-pr-is-approved)
+
+## Clone the Thrust Repository
+
+To get started, clone the main repository to your local computer. Thrust should
+be cloned recursively to setup the CUB submodule (required for `CUDA`
+acceleration).
+
+```
+git clone --recursive https://github.com/NVIDIA/thrust.git
+cd thrust
+```
+
+## Setup a Fork of Thrust
+
+You'll need a fork of Thrust on Github to create a pull request. To setup your
+fork:
+
+1. Create a Github account (if needed)
+2. Go to [the Thrust Github page](https://github.com/NVIDIA/thrust)
+3. Click "Fork" and follow any prompts that appear.
+
+Once your fork is created, setup a new remote repo in your local Thrust clone:
+
+```
+git remote add github-fork git@github.com:<GITHUB_USERNAME>/thrust.git
+```
+
+If you need to modify CUB, too, go to
+[the CUB Github page](https://github.com/NVIDIA/cub) and repeat this process.
+Create CUB's `github-fork` remote in the `thrust/dependencies/cub` submodule.
+
+## Setup Your Environment
+
+### Git Environment
+
+If you haven't already, this is a good time to tell git who you are. This
+information is used to fill out authorship information on your git commits.
+
+```
+git config --global user.name "John Doe"
+git config --global user.email johndoe@example.com
+```
+
+### Configure CMake builds
+
+Thrust uses [CMake](https://www.cmake.org) for its primary build system. To
+configure, build, and test your checkout of Thrust:
+
+```
+# Create build directory:
+mkdir build
+cd build
+
+# Configure -- use one of the following:
+cmake ..                                 # Command line interface
+cmake -DTHRUST_INCLUDE_CUB_CMAKE=ON ..   # Enables CUB development targets
+ccmake ..                # ncurses GUI (Linux only)
+cmake-gui                # Graphical UI, set source/build directories in the app
+
+# Build:
+cmake --build . -j <num jobs>   # invokes make (or ninja, etc)
+
+# Run tests and examples:
+ctest
+```
+
+See [CMake Options](../setup/cmake_options.md) for details on customizing the build. To
+enable CUB tests and examples, set the `THRUST_INCLUDE_CUB_CMAKE` option to
+`ON`. Additional CMake options for CUB are listed
+[here](https://github.com/NVIDIA/cub/blob/main/CONTRIBUTING.md#cmake-options).
+
+## Create a Development Branch
+
+All work should be done in a development branch (also called a "topic branch")
+and not directly in the `main` branch. This makes it easier to manage multiple
+in-progress patches at once, and provides a descriptive label for your patch
+as it passes through the review system.
+
+To create a new branch based on the current `main`:
+
+```
+# Checkout local main branch:
+cd /path/to/thrust/sources
+git checkout main
+
+# Sync local main branch with github:
+git pull
+
+# Create a new branch named `my_descriptive_branch_name` based on main:
+git checkout -b my_descriptive_branch_name
+
+# Verify that the branch has been created and is currently checked out:
+git branch
+```
+
+Thrust branch names should follow a particular pattern:
+
+- For new features, name the branch `feature/<name>`
+- For bugfixes associated with a github issue, use `bug/github/<bug-description>-<bug-id>`
+  - Internal nvidia and gitlab bugs should use `nvidia` or `gitlab` in place of
+    `github`.
+
+If you plan to work on CUB as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Local Development Loop
+
+### Edit, Build, Test, Repeat
+
+Once the topic branch is created, you're all set to start working on Thrust
+code. Make some changes, then build and test them:
+
+```
+# Implement changes:
+cd /path/to/thrust/sources
+emacs thrust/some_file.h # or whatever editor you prefer
+
+# Create / update a unit test for your changes:
+emacs testing/some_test.cu
+
+# Check that everything builds and tests pass:
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+```
+
+### Creating a Commit
+
+Once you're satisfied with your patch, commit your changes:
+
+#### Thrust-only Changes
+
+```
+# Manually add changed files and create a commit:
+cd /path/to/thrust
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+git gui
+```
+
+#### Thrust and CUB Changes
+
+```
+# Create CUB patch first:
+cd /path/to/thrust/dependencies/cub
+# Manually add changed files and create a commit:
+git add cub/some_file.cuh
+git commit
+
+# Create Thrust patch, including submodule update:
+cd /path/to/thrust/
+git add dependencies/cub # Updates submodule info
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit
+
+# Or, if possible, use git-gui to review your changes while building your patch:
+cd /path/to/thrust/dependencies/cub
+git gui
+cd /path/to/thrust
+git gui # Include dependencies/cub as part of your commit
+
+```
+
+#### Writing a Commit Message
+
+Your commit message will communicate the purpose and rationale behind your
+patch to other developers, and will be used to populate the initial description
+of your Github pull request.
+
+When writing a commit message, the following standard format should be used,
+since tools in the git ecosystem are designed to parse this correctly:
+
+```
+First line of commit message is a short summary (<80 char)
+<Second line left blank>
+Detailed description of change begins on third line. This portion can
+span multiple lines, try to manually wrap them at something reasonable.
+
+Blank lines can be used to separate multiple paragraphs in the description.
+
+If your patch is associated with another pull request or issue in the main
+Thrust repository, you should reference it with a `#` symbol, e.g.
+#1023 for issue 1023.
+
+For issues / pull requests in a different github repo, reference them using
+the full syntax, e.g. NVIDIA/cub#4 for issue 4 in the NVIDIA/cub repo.
+
+Markdown is recommended for formatting more detailed messages, as these will
+be nicely rendered on Github, etc.
+```
+
+## Push Development Branch to your Fork
+
+Once you've committed your changes to a local development branch, it's time to
+push them to your fork:
+
+```
+cd /path/to/thrust/checkout
+git checkout my_descriptive_branch_name # if not already checked out
+git push --set-upstream github-fork my_descriptive_branch_name
+```
+
+`--set-upstream github-fork` tells git that future pushes/pulls on this branch
+should target your `github-fork` remote by default.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule.
+
+## Create Pull Request
+
+To create a pull request for your freshly pushed branch, open your github fork
+in a browser by going to `https://www.github.com/<GITHUB_USERNAME>/thrust`. A
+prompt may automatically appear asking you to create a pull request if you've
+recently pushed a branch.
+
+If there's no prompt, go to "Code" > "Branches" and click the appropriate
+"New pull request" button for your branch.
+
+If you would like a specific developer to review your patch, feel free to
+request them as a reviewer at this time.
+
+The Thrust team will review your patch, test it on NVIDIA's internal CI, and
+provide feedback.
+
+
+If have CUB changes to commit as part of your patch, repeat this process with
+your CUB branch and fork.
+
+## Address Feedback and Update Pull Request
+
+If the reviewers request changes to your patch, use the following process to
+update the pull request:
+
+```
+# Make changes:
+cd /path/to/thrust/sources
+git checkout my_descriptive_branch_name
+emacs thrust/some_file.h
+emacs testing/some_test.cu
+
+# Build + test
+cd /path/to/thrust/build/directory
+cmake --build . -j <num jobs>
+ctest
+
+# Amend commit:
+cd /path/to/thrust/sources
+git add thrust/some_file.h
+git add testing/some_test.cu
+git commit --amend
+# Or
+git gui # Check the "Amend Last Commit" box
+
+# Update the branch on your fork:
+git push -f
+```
+
+At this point, the pull request should show your recent changes.
+
+If have CUB changes to commit as part of your patch, repeat this process in the
+`thrust/dependencies/cub` submodule, and be sure to include any CUB submodule
+updates as part of your commit.
+
+## When Your PR is Approved
+
+Once your pull request is approved by the Thrust team, no further action is
+needed from you. We will handle integrating it since we must coordinate changes
+to `main` with NVIDIA's internal perforce repository.
+
diff --git a/docs/github_pages/favicon.ico b/docs/github_pages/favicon.ico
new file mode 100644
index 000000000..424df8720
Binary files /dev/null and b/docs/github_pages/favicon.ico differ
diff --git a/docs/github_pages/releases.md b/docs/github_pages/releases.md
new file mode 100644
index 000000000..81a5f2f3d
--- /dev/null
+++ b/docs/github_pages/releases.md
@@ -0,0 +1,60 @@
+---
+has_children: true
+has_toc: true
+nav_order: 3
+---
+
+# Releases
+
+| Version         | Included In                               |
+|-----------------|-------------------------------------------|
+| 2.0.1           | CUDA Toolkit 12.0                         |
+| 2.0.0           | TBD                                       |
+| 1.17.2          | TBD                                       |
+| 1.17.1          | TBD                                       |
+| 1.17.0          | TBD                                       |
+| 1.16.0          | TBD                                       |
+| 1.15.0          | NVIDIA HPC SDK 22.1 & CUDA Toolkit 11.6   |
+| 1.14.0          | NVIDIA HPC SDK 21.9                       |
+| 1.13.1          | CUDA Toolkit 11.5                         |
+| 1.13.1          | CUDA Toolkit 11.5                         |
+| 1.13.0          | NVIDIA HPC SDK 21.7                       |
+| 1.12.1          | CUDA Toolkit 11.4                         |
+| 1.12.0          | NVIDIA HPC SDK 21.3                       |
+| 1.11.0          | CUDA Toolkit 11.3                         |
+| 1.10.0          | NVIDIA HPC SDK 20.9 & CUDA Toolkit 11.2   |
+| 1.9.10-1        | NVIDIA HPC SDK 20.7 & CUDA Toolkit 11.1   |
+| 1.9.10          | NVIDIA HPC SDK 20.5                       |
+| 1.9.9           | CUDA Toolkit 11.0                         |
+| 1.9.8-1         | NVIDIA HPC SDK 20.3                       |
+| 1.9.8           | CUDA Toolkit 11.0 Early Access            |
+| 1.9.7-1         | CUDA Toolkit 10.2 for Tegra               |
+| 1.9.7           | CUDA Toolkit 10.2                         |
+| 1.9.6-1         | NVIDIA HPC SDK 20.3                       |
+| 1.9.6           | CUDA Toolkit 10.1 Update 2                |
+| 1.9.5           | CUDA Toolkit 10.1 Update 1                |
+| 1.9.4           | CUDA Toolkit 10.1                         |
+| 1.9.3           | CUDA Toolkit 10.0                         |
+| 1.9.2           | CUDA Toolkit 9.2                          |
+| 1.9.1-2         | CUDA Toolkit 9.1                          |
+| 1.9.0-5         | CUDA Toolkit 9.0                          |
+| 1.8.3           | CUDA Toolkit 8.0                          |
+| 1.8.2           | CUDA Toolkit 7.5                          |
+| 1.8.1           | CUDA Toolkit 7.0                          |
+| 1.8.0           |                                           |
+| 1.7.2           | CUDA Toolkit 6.5                          |
+| 1.7.1           | CUDA Toolkit 6.0                          |
+| 1.7.0           | CUDA Toolkit 5.5                          |
+| 1.6.0           |                                           |
+| 1.5.3           | CUDA Toolkit 5.0                          |
+| 1.5.2           | CUDA Toolkit 4.2                          |
+| 1.5.1           | CUDA Toolkit 4.1                          |
+| 1.5.0           |                                           |
+| 1.4.0           | CUDA Toolkit 4.0                          |
+| 1.3.0           |                                           |
+| 1.2.1           |                                           |
+| 1.2.0           |                                           |
+| 1.1.1           |                                           |
+| 1.1.0           |                                           |
+| 1.0.0           |                                           |
+
diff --git a/docs/github_pages/releases/versioning.md b/docs/github_pages/releases/versioning.md
new file mode 100644
index 000000000..e5f0e8eb1
--- /dev/null
+++ b/docs/github_pages/releases/versioning.md
@@ -0,0 +1,71 @@
+---
+parent: Releases
+nav_order: 1
+---
+
+# Versioning
+
+Thrust has its own versioning system for releases, independent of the
+  versioning scheme of the NVIDIA HPC SDK or the CUDA Toolkit.
+
+Today, Thrust version numbers have a specific [semantic meaning](https://semver.org/).
+Releases prior to 1.10.0 largely, but not strictly, followed these semantic
+  meanings.
+
+The version number for a Thrust release uses the following format:
+  `MMM.mmm.ss-ppp`, where:
+
+* `THRUST_VERSION_MAJOR`/`MMM`: Major version, up to 3 decimal digits.
+  It is incremented when changes that are API-backwards-incompatible are made.
+* `THRUST_VERSION_MINOR`/`mmm`: Minor version, up to 3 decimal digits.
+  It is incremented when breaking API, ABI, or semantic changes are made.
+* `THRUST_VERSION_SUBMINOR`/`ss`: Subminor version, up to 2 decimal digits.
+  It is incremented when notable new features or bug fixes or features that are
+  API-backwards-compatible are made.
+* `THRUST_PATCH_NUMBER`/`ppp`: Patch number, up to 3 decimal digits.
+  This is no longer used and will be zero for all future releases.
+
+The `<thrust/version.h>` header defines `THRUST_*` macros for all of the
+  version components mentioned above.
+Additionally, a `THRUST_VERSION` macro is defined, which is an integer literal
+  containing all of the version components except for `THRUST_PATCH_NUMBER`.
+
+## Trunk Based Development
+
+Thrust uses [trunk based development](https://trunkbaseddevelopment.com).
+There is a single long-lived branch called `main`, which is public and the
+  "source of truth".
+All other branches are downstream from `main`.
+Engineers may create branches for feature development.
+Such branches always merge into `main`.
+There are no release branches.
+Releases are produced by taking a snapshot of `main` ("snapping").
+After a release has been snapped from `main`, it will never be changed.
+
+## Branches and Tags
+
+The following tag names are used in the Thrust project:
+
+* `nvhpc-X.Y`: the tag that directly corresponds to what has been
+  shipped in the NVIDIA HPC SDK release X.Y.
+* `cuda-X.Y`: the tag that directly corresponds to what has been shipped
+  in the CUDA Toolkit release X.Y.
+* `A.B.C`: the tag that directly corresponds to Thrust version A.B.C.
+* `A.B.C-rcN`: the tag that directly corresponds to Thrust version A.B.C
+  release candidate N.
+
+The following branch names are used in the Thrust project:
+
+* `main`: the "source of truth" development branch of Thrust.
+* `old-master`: the old "source of truth" branch, before unification of
+  public and internal repositories.
+* `feature/<name>`: feature branch for a feature under development.
+* `bug/<bug-system>/<bug-description>-<bug-id>`: bug fix branch, where
+  `bug-system` is `github` or `nvidia`.
+
+On the rare occasion that we cannot do work in the open, for example when
+  developing a change specific to an unreleased product, these branches may
+  exist on an internal NVIDIA GitLab instance instead of the public GitHub.
+By default, everything should be in the open on GitHub unless there is a strong
+  motivation for it to not be open.
+
diff --git a/docs/github_pages/setup.md b/docs/github_pages/setup.md
new file mode 100644
index 000000000..edbef2e5c
--- /dev/null
+++ b/docs/github_pages/setup.md
@@ -0,0 +1,7 @@
+---
+has_children: true
+has_toc: true
+nav_order: 1
+---
+
+# Setup
diff --git a/docs/github_pages/setup/cmake_options.md b/docs/github_pages/setup/cmake_options.md
new file mode 100644
index 000000000..b62faddeb
--- /dev/null
+++ b/docs/github_pages/setup/cmake_options.md
@@ -0,0 +1,139 @@
+---
+parent: Setup
+nav_order: 1
+---
+
+# CMake Options
+
+A Thrust build is configured using CMake options. These may be passed to CMake
+using
+
+```
+cmake -D<option_name>=<value> /path/to/thrust/sources
+```
+
+or configured interactively with the `ccmake` or `cmake-gui` interfaces.
+
+Thrust supports two build modes. By default, a single configuration is built
+that targets a specific host system, device system, and C++ dialect.
+When `THRUST_ENABLE_MULTICONFIG` is `ON`, multiple configurations
+targeting a variety of systems and dialects are generated.
+
+The CMake options are divided into these categories:
+
+1. [Generic CMake Options](#generic-cmake-options): Options applicable to all
+   Thrust builds.
+1. [Single Config CMake Options](#single-config-cmake-options) Options
+   applicable only when `THRUST_ENABLE_MULTICONFIG` is disabled.
+1. [Multi Config CMake Options](#multi-config-cmake-options) Options applicable
+   only when `THRUST_ENABLE_MULTICONFIG` is enabled.
+1. [CUDA Specific CMake Options](#cuda-specific-cmake-options) Options that
+   control CUDA compilation. Only available when one or more configurations
+   targets the CUDA system.
+1. [TBB Specific CMake Options](#tbb-specific-cmake-options) Options that
+   control TBB compilation. Only available when one or more configurations
+   targets the TBB system.
+
+## Generic CMake Options
+
+- `CMAKE_BUILD_TYPE={Release, Debug, RelWithDebInfo, MinSizeRel}`
+  - Standard CMake build option. Default: `RelWithDebInfo`
+- `THRUST_ENABLE_HEADER_TESTING={ON, OFF}`
+  - Whether to test compile public headers. Default is `ON`.
+- `THRUST_ENABLE_TESTING={ON, OFF}`
+  - Whether to build unit tests. Default is `ON`.
+- `THRUST_ENABLE_EXAMPLES={ON, OFF}`
+  - Whether to build examples. Default is `ON`.
+- `THRUST_ENABLE_MULTICONFIG={ON, OFF}`
+  - Toggles single-config and multi-config modes. Default is `OFF` (single config).
+- `THRUST_ENABLE_EXAMPLE_FILECHECK={ON, OFF}`
+  - Enable validation of example outputs using the LLVM FileCheck utility.
+    Default is `OFF`.
+- `THRUST_ENABLE_INSTALL_RULES={ON, OFF}`
+  - If true, installation rules will be generated for thrust. Default is `ON`.
+
+## Single Config CMake Options
+
+- `THRUST_HOST_SYSTEM={CPP, TBB, OMP}`
+  - Selects the host system. Default: `CPP`
+- `THRUST_DEVICE_SYSTEM={CUDA, TBB, OMP, CPP}`
+  - Selects the device system. Default: `CUDA`
+- `THRUST_CPP_DIALECT={11, 14, 17}`
+  - Selects the C++ standard dialect to use. Default is `14` (C++14).
+
+## Multi Config CMake Options
+
+- `THRUST_MULTICONFIG_ENABLE_DIALECT_CPPXX={ON, OFF}`
+  - Toggle whether a specific C++ dialect will be targeted.
+  - Possible values of `XX` are `{11, 14, 17}`.
+  - By default, only C++14 is enabled.
+- `THRUST_MULTICONFIG_ENABLE_SYSTEM_XXXX={ON, OFF}`
+  - Toggle whether a specific system will be targeted.
+  - Possible values of `XXXX` are `{CPP, CUDA, TBB, OMP}`
+  - By default, only `CPP` and `CUDA` are enabled.
+- `THRUST_MULTICONFIG_WORKLOAD={SMALL, MEDIUM, LARGE, FULL}`
+  - Restricts the host/device combinations that will be targeted.
+  - By default, the `SMALL` workload is used.
+  - The full cross product of `host x device` systems results in 12
+    configurations, some of which are more important than others.
+    This option can be used to prune some of the less important ones.
+  - `SMALL`: (3 configs) Minimal coverage and validation of each device system against the `CPP` host.
+  - `MEDIUM`: (6 configs) Cheap extended coverage.
+  - `LARGE`: (8 configs) Expensive extended coverage. Includes all useful build configurations.
+  - `FULL`: (12 configs) The complete cross product of all possible build configurations.
+
+| Config   | Workloads | Value      | Expense   | Note                         |
+|----------|-----------|------------|-----------|------------------------------|
+| CPP/CUDA | `F L M S` | Essential  | Expensive | Validates CUDA against CPP   |
+| CPP/OMP  | `F L M S` | Essential  | Cheap     | Validates OMP against CPP    |
+| CPP/TBB  | `F L M S` | Essential  | Cheap     | Validates TBB against CPP    |
+| CPP/CPP  | `F L M  ` | Important  | Cheap     | Tests CPP as device          |
+| OMP/OMP  | `F L M  ` | Important  | Cheap     | Tests OMP as host            |
+| TBB/TBB  | `F L M  ` | Important  | Cheap     | Tests TBB as host            |
+| TBB/CUDA | `F L    ` | Important  | Expensive | Validates TBB/CUDA interop   |
+| OMP/CUDA | `F L    ` | Important  | Expensive | Validates OMP/CUDA interop   |
+| TBB/OMP  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| OMP/TBB  | `F      ` | Not useful | Cheap     | Mixes CPU-parallel systems   |
+| TBB/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+| OMP/CPP  | `F      ` | Not Useful | Cheap     | Parallel host, serial device |
+
+## CUDA Specific CMake Options
+
+- `THRUST_INCLUDE_CUB_CMAKE={ON, OFF}`
+  - If enabled, the CUB project will be built as part of Thrust. Default is
+    `OFF`.
+  - This adds CUB tests, etc. Useful for working on both CUB and Thrust
+    simultaneously.
+  - CUB configurations will be generated for each C++ dialect targeted by
+    the current Thrust build.
+- `THRUST_INSTALL_CUB_HEADERS={ON, OFF}`
+  - If enabled, the CUB project's headers will be installed through Thrust's
+    installation rules. Default is `ON`.
+  - This option depends on `THRUST_ENABLE_INSTALL_RULES`.
+- `THRUST_ENABLE_COMPUTE_XX={ON, OFF}`
+  - Controls the targeted CUDA architecture(s)
+  - Multiple options may be selected when using NVCC as the CUDA compiler.
+  - Valid values of `XX` are:
+    `{35, 37, 50, 52, 53, 60, 61, 62, 70, 72, 75, 80}`
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_ENABLE_COMPUTE_FUTURE={ON, OFF}`
+  - If enabled, CUDA objects will target the most recent virtual architecture
+    in addition to the real architectures specified by the
+    `THRUST_ENABLE_COMPUTE_XX` options.
+  - Default value depends on `THRUST_DISABLE_ARCH_BY_DEFAULT`:
+- `THRUST_DISABLE_ARCH_BY_DEFAULT={ON, OFF}`
+  - When `ON`, all `THRUST_ENABLE_COMPUTE_*` options are initially `OFF`.
+  - Default: `OFF` (meaning all architectures are enabled by default)
+- `THRUST_ENABLE_TESTS_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building tests.
+    Default is `OFF`.
+- `THRUST_ENABLE_EXAMPLES_WITH_RDC={ON, OFF}`
+  - Whether to enable Relocatable Device Code when building examples.
+    Default is `OFF`.
+
+## TBB Specific CMake Options
+
+- `THRUST_TBB_ROOT=<path to tbb root>`
+  - When the TBB system is requested, set this to the root of the TBB installation
+    (e.g. the location of `lib/`, `bin/` and `include/` for the TBB libraries).
+
diff --git a/docs/github_pages/setup/requirements.md b/docs/github_pages/setup/requirements.md
new file mode 100644
index 000000000..9d5316456
--- /dev/null
+++ b/docs/github_pages/setup/requirements.md
@@ -0,0 +1,82 @@
+---
+parent: Setup
+nav_order: 0
+---
+
+# Requirements
+
+All requirements are applicable to the `main` branch on GitHub.
+For details on specific releases, please see the [CHANGELOG.md].
+
+## Usage Requirements
+
+To use the NVIDIA C++ Standard Library, you must meet the following
+  requirements.
+
+### System Software
+
+Thrust and CUB require either the [NVIDIA HPC SDK] or the [CUDA Toolkit].
+
+Releases of Thrust and CUB are only tested against the latest releases of NVHPC
+  and CUDA.
+It may be possible to use newer version of Thrust and CUB with an older NVHPC or
+  CUDA installation by using a Thrust and CUB release from GitHub, but please
+  be aware this is not officially supported.
+
+### C++ Dialects
+
+Thrust and CUB support the following C++ dialects:
+
+- C++11 (deprecated)
+- C++14
+- C++17
+
+### Compilers
+
+Thrust and CUB support the following compilers when used in conjunction with
+  NVCC:
+
+- NVCC (latest version)
+- NVC++ (latest version)
+- GCC 5+
+- Clang 7+
+- MSVC 2019+ (19.20/16.0/14.20)
+
+Unsupported versions may emit deprecation warnings, which can be
+  silenced by defining `THRUST_IGNORE_DEPRECATED_COMPILER` during compilation.
+
+### Device Architectures
+
+Thrust and CUB support all NVIDIA device architectures since SM 35.
+
+### Host Architectures
+
+Thrust and CUB support the following host architectures:
+
+- aarch64.
+- x86-64.
+- ppc64le.
+
+### Host Operating Systems
+
+Thrust and CUB support the following host operating systems:
+
+- Linux.
+- Windows.
+
+## Build and Test Requirements
+
+To build and test Thrust and CUB yourself, you will need the following in
+  addition to the above requirements:
+
+- [CMake].
+
+
+
+[changelog]: ./releases/changelog.md
+
+[NVIDIA HPC SDK]: https://developer.nvidia.com/hpc-sdk
+[CUDA Toolkit]: https://developer.nvidia.com/cuda-toolkit
+
+[CMake]: https://cmake.org
+
diff --git a/docs/serve_docs_locally.bash b/docs/serve_docs_locally.bash
new file mode 100755
index 000000000..f438795e4
--- /dev/null
+++ b/docs/serve_docs_locally.bash
@@ -0,0 +1,35 @@
+#! /usr/bin/env bash
+
+###############################################################################
+# Copyright (c) 2018-2021 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+SCRIPT_PATH=$(cd $(dirname ${0}); pwd -P)
+
+REPO_PATH=${SCRIPT_PATH}/..
+
+BUILD_DOCS_PATH=build_docs
+BUILD_GITHUB_PAGES_PATH=${BUILD_DOCS_PATH}/github_pages
+
+cd ${REPO_PATH}/${BUILD_GITHUB_PAGES_PATH}
+
+bundle install
+bundle exec jekyll serve \
+  --verbose              \
+  --incremental          \
+  --profile              \
+  --baseurl "/thrust"    \
+  ${@}
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 000000000..306ecb7a3
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,157 @@
+# Setup FileCheck if requested and available:
+option(THRUST_ENABLE_EXAMPLE_FILECHECK
+  "Check example output with the LLVM FileCheck utility."
+  OFF
+)
+set(filecheck_data_path "${Thrust_SOURCE_DIR}/internal/test")
+
+if (THRUST_ENABLE_EXAMPLE_FILECHECK)
+  # TODO this should go into a find module
+  find_program(THRUST_FILECHECK_EXECUTABLE
+    DOC "Path to the LLVM FileCheck utility."
+    NAMES
+      FileCheck
+      FileCheck-3.9
+      FileCheck-4.0
+      FileCheck-5.0
+      FileCheck-6.0
+      FileCheck-7
+      FileCheck-8
+      FileCheck-9
+  )
+
+  if (NOT THRUST_FILECHECK_EXECUTABLE)
+    message(FATAL_ERROR
+      "Could not find the LLVM FileCheck utility. Set THRUST_FILECHECK_EXECUTABLE manually, "
+      "or disable THRUST_ENABLE_EXAMPLE_FILECHECK."
+    )
+  endif()
+
+  execute_process(
+    COMMAND "${THRUST_FILECHECK_EXECUTABLE}" "${filecheck_data_path}/thrust.smoke.filecheck"
+    INPUT_FILE "${Thrust_SOURCE_DIR}/cmake/filecheck_smoke_test"
+    RESULT_VARIABLE exit_code
+  )
+
+  if (0 EQUAL exit_code)
+    message(STATUS "FileCheck enabled: ${THRUST_FILECHECK_EXECUTABLE}")
+  else()
+    message(FATAL_ERROR
+      "The current THRUST_FILECHECK_EXECUTABLE ('${THRUST_FILECHECK_EXECUTABLE}') "
+      "does not seem to be a valid FileCheck executable."
+    )
+  endif()
+endif()
+
+# Create meta targets that build all examples for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.examples)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_EXAMPLES_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+## thrust_add_example
+#
+# Add an example executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the example
+#   target. Useful for post-processing target information per-backend.
+# example_name: The name of the example minus "<config_prefix>.example." For
+#   instance, examples/vector.cu will be "vector", and examples/cuda/copy.cu
+#   would be "cuda.copy".
+# example_src: The source file that implements the example.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_example target_name_var example_name example_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_example_src "${example_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_example_src "${example_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(example_target ${config_prefix}.example.${example_name})
+  set(${target_name_var} ${example_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_meta_target ${config_prefix}.examples)
+  set(example_meta_target thrust.all.example.${example_name})
+
+  add_executable(${example_target} "${real_example_src}")
+  target_link_libraries(${example_target} ${thrust_target})
+  target_include_directories(${example_target} PRIVATE "${Thrust_SOURCE_DIR}/examples")
+  thrust_clone_target_properties(${example_target} ${thrust_target})
+  thrust_fix_clang_nvcc_build_for(${example_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${example_target})
+
+  # Meta target that builds examples with this name for all configurations:
+  if (NOT TARGET ${example_meta_target})
+    add_custom_target(${example_meta_target})
+  endif()
+  add_dependencies(${example_meta_target} ${example_target})
+
+  if ("CUDA" STREQUAL "${config_device}" AND
+      THRUST_ENABLE_EXAMPLES_WITH_RDC)
+    thrust_enable_rdc_for_cuda_target(${example_target})
+  endif()
+
+  if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    target_compile_definitions(${example_target} PRIVATE THRUST_EXAMPLE_DEVICE_SIDE)
+  endif()
+
+  # Get the name of FileCheck input by stripping out the config name.
+  # (e.g. "thrust.cpp.cuda.cpp14.example.xxx" -> "thrust.example.xxx.filecheck")
+  string(REPLACE "${config_prefix}" "thrust"
+    filecheck_reference_file
+    "${example_target}.filecheck"
+  )
+
+  add_test(NAME ${example_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DEXAMPLE_EXECUTABLE=$<TARGET_FILE:${example_target}>"
+    "-DFILECHECK_ENABLED=${THRUST_ENABLE_EXAMPLE_FILECHECK}"
+    "-DFILECHECK_EXECUTABLE=${THRUST_FILECHECK_EXECUTABLE}"
+    "-DREFERENCE_FILE=${filecheck_data_path}/${filecheck_reference_file}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunExample.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${example_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+endfunction()
+
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
+
+add_subdirectory(cmake)
+add_subdirectory(cuda)
diff --git a/examples/README b/examples/README.md
similarity index 56%
rename from examples/README
rename to examples/README.md
index aaa0b5489..8a43897bb 100644
--- a/examples/README
+++ b/examples/README.md
@@ -4,8 +4,4 @@ norm example.
   $ nvcc norm.cu -o norm
 
 These examples are also available online:
-  http://code.google.com/p/thrust/source/browse/#hg/examples
-
-For additional information refer to the Quick Start Guide:
-  http://code.google.com/p/thrust/wiki/QuickStartGuide
-
+  https://github.com/NVIDIA/thrust/tree/main/examples
diff --git a/examples/SConscript b/examples/SConscript
deleted file mode 100644
index 5203c2e15..000000000
--- a/examples/SConscript
+++ /dev/null
@@ -1,29 +0,0 @@
-import os
-
-Import('env')
-
-# create a clone of the environment so that we don't alter the parent
-my_env = env.Clone()
-
-# find all .cus & .cpps in the current directory
-sources = []
-directories = ['.']
-
-# find all .cus & .cpps in the current directory
-sources = []
-directories = ['.', my_env['device_backend']]
-extensions = ['.cu','.cpp']
-
-for dir in directories:
-  for ext in extensions:
-    regex = os.path.join(dir, '*' + ext)
-    sources.extend(my_env.Glob(regex))
-
-# compile examples
-for src in sources:
-  program = my_env.Program(src)
-  # add the program to the 'run_examples' alias
-  program_alias = my_env.Alias('run_examples', [program], program[0].abspath)
-  # always build the 'run_examples' target whether or not it needs it
-  my_env.AlwaysBuild(program_alias)
-
diff --git a/examples/arbitrary_transformation.cu b/examples/arbitrary_transformation.cu
index d1a15096f..be22c2e5a 100644
--- a/examples/arbitrary_transformation.cu
+++ b/examples/arbitrary_transformation.cu
@@ -3,6 +3,12 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <iostream>
 
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+#include <thrust/zip_function.h>
+#endif // >= C++11
+
 // This example shows how to implement an arbitrary transformation of
 // the form output[i] = F(first[i], second[i], third[i], ... ).
 // In this example, we use a function with 3 inputs and 1 output.
@@ -22,6 +28,10 @@
 //      D[i] = A[i] + B[i] * C[i];
 // by invoking arbitrary_functor() on each of the tuples using for_each.
 //
+// If we are using a functor that is not designed for zip iterators by taking a
+// tuple instead of individual arguments we can adapt this function using the
+// zip_function adaptor (C++11 only).
+//
 // Note that we could extend this example to implement functions with an
 // arbitrary number of input arguments by zipping more sequence together.
 // With the same approach we can have multiple *output* sequences, if we 
@@ -31,7 +41,7 @@
 //
 // The possibilities are endless! :)
 
-struct arbitrary_functor
+struct arbitrary_functor1
 {
     template <typename Tuple>
     __host__ __device__
@@ -42,6 +52,17 @@ struct arbitrary_functor
     }
 };
 
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+struct arbitrary_functor2
+{
+    __host__ __device__
+    void operator()(const float& a, const float& b, const float& c, float& d)
+    {
+        // D[i] = A[i] + B[i] * C[i];
+        d = a + b * c;
+    }
+};
+#endif // >= C++11
 
 int main(void)
 {
@@ -49,7 +70,7 @@ int main(void)
     thrust::device_vector<float> A(5);
     thrust::device_vector<float> B(5);
     thrust::device_vector<float> C(5);
-    thrust::device_vector<float> D(5);
+    thrust::device_vector<float> D1(5);
 
     // initialize input vectors
     A[0] = 3;  B[0] = 6;  C[0] = 2; 
@@ -59,12 +80,26 @@ int main(void)
     A[4] = 2;  B[4] = 8;  C[4] = 3; 
 
     // apply the transformation
-    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D.begin())),
-                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D.end())),
-                     arbitrary_functor());
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D1.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D1.end())),
+                     arbitrary_functor1());
+
+    // print the output
+    std::cout << "Tuple functor" << std::endl;
+    for(int i = 0; i < 5; i++)
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D1[i] << std::endl;
+
+    // apply the transformation using zip_function
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+    thrust::device_vector<float> D2(5);
+    thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin(), D2.begin())),
+                     thrust::make_zip_iterator(thrust::make_tuple(A.end(),   B.end(),   C.end(),   D2.end())),
+                     thrust::make_zip_function(arbitrary_functor2()));
 
     // print the output
+    std::cout << "N-ary functor" << std::endl;
     for(int i = 0; i < 5; i++)
-        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D[i] << std::endl;
+        std::cout << A[i] << " + " << B[i] << " * " << C[i] << " = " << D2[i] << std::endl;
+#endif // >= C++11
 }
 
diff --git a/examples/bounding_box.cu b/examples/bounding_box.cu
index baced76f6..cca71a45e 100644
--- a/examples/bounding_box.cu
+++ b/examples/bounding_box.cu
@@ -31,6 +31,15 @@ struct bbox
     : lower_left(point), upper_right(point)
   {}
 
+  // construct a box from a single point
+  __host__ __device__
+  bbox& operator=(const point2d &point)
+  {
+    lower_left = point;
+    upper_right = point;
+    return *this;
+  }
+
   // construct a box from a pair of points
   __host__ __device__
   bbox(const point2d &ll, const point2d &ur)
diff --git a/examples/cmake/CMakeLists.txt b/examples/cmake/CMakeLists.txt
new file mode 100644
index 000000000..25d2a2f95
--- /dev/null
+++ b/examples/cmake/CMakeLists.txt
@@ -0,0 +1,28 @@
+thrust_update_system_found_flags()
+
+set(extra_cmake_flags)
+
+# Need to pass these when testing NVC++.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(extra_cmake_flags
+    -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}"
+    -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}"
+  )
+endif()
+
+if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND)
+  # Do a basic check of the cmake/ThrustAddSubdir.cmake mechanism:
+  add_test(
+    NAME thrust.example.cmake.add_subdir
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -G "${CMAKE_GENERATOR}"
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/add_subdir"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/add_subdir"
+      -D "THRUST_ROOT=${Thrust_SOURCE_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+      ${extra_cmake_flags}
+  )
+endif()
diff --git a/examples/cmake/add_subdir/CMakeLists.txt b/examples/cmake/add_subdir/CMakeLists.txt
new file mode 100644
index 000000000..96283699f
--- /dev/null
+++ b/examples/cmake/add_subdir/CMakeLists.txt
@@ -0,0 +1,91 @@
+# This example demonstrates / tests adding thrust via a CMake add_subdirectory
+# call from a parent project.
+#
+# The variables THRUST_REQUIRED_SYSTEMS and THRUST_OPTIONAL_SYSTEMS must be
+# set prior to add_subdirectory(thrust), and afterwards the thrust_create_target
+# function may be used to create targets with the desired systems. See
+# NVIDIA/thrust/cmake/README.md for more details on thrust_create_target.
+
+cmake_minimum_required(VERSION 3.15)
+
+# Silence warnings about empty CUDA_ARCHITECTURES properties on example targets:
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+  cmake_policy(SET CMP0104 OLD)
+endif()
+
+project(ThrustAddSubDirExample CXX)
+
+# Add required Thrust systems to THRUST_REQUIRED_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# An error is emitted if the system is not found.
+set(THRUST_REQUIRED_SYSTEMS CPP)
+
+# Add optional Thrust systems to THRUST_OPTIONAL_SYSTEMS.
+# Options are: CPP, CUDA, TBB or OMP.
+# Multiple systems may be specified.
+# No error is emitted if not found.
+set(THRUST_OPTIONAL_SYSTEMS CUDA)
+
+# Use your project's checkout of Thrust here, for most cases
+# `add_subdirectory(thrust)` will be sufficient.
+add_subdirectory("${THRUST_ROOT}" thrust)
+
+# Create a thrust target that only uses the serial CPP backend.
+# See thrust/thrust/cmake/README.md for details and additional options:
+thrust_create_target(ThrustCPP HOST CPP DEVICE CPP)
+
+# Create an executable that uses the CPP-only thrust target:
+add_executable(ExecWithCPP dummy.cpp)
+target_link_libraries(ExecWithCPP ThrustCPP)
+
+# To test for optional systems, first call thrust_update_system_found_flags to
+# set the THRUST_${system}_FOUND flags in current scope.
+# Required due to CMake scoping rules.
+thrust_update_system_found_flags()
+
+# Create and use a Thrust target configured to use CUDA acceleration if CUDA
+# is available:
+if (THRUST_CUDA_FOUND)
+  enable_language(CUDA)
+  thrust_create_target(ThrustCUDA HOST CPP DEVICE CUDA)
+  add_executable(ExecWithCUDA dummy.cu)
+  target_link_libraries(ExecWithCUDA ThrustCUDA)
+endif()
+
+#
+# Validation
+#
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+assert_boolean(THRUST_CPP_FOUND TRUE)
+assert_boolean(THRUST_CUDA_FOUND TRUE)
+assert_boolean(THRUST_OMP_FOUND FALSE)
+assert_boolean(THRUST_TBB_FOUND FALSE)
+
+assert_target(ThrustCPP)
+assert_target(ThrustCUDA)
+assert_target(ExecWithCPP)
+assert_target(ExecWithCUDA)
+
+thrust_debug_target(ThrustCPP "")
+thrust_debug_target(ThrustCUDA "")
+thrust_debug_target(ExecWithCPP "")
+thrust_debug_target(ExecWithCUDA "")
diff --git a/examples/cmake/add_subdir/dummy.cpp b/examples/cmake/add_subdir/dummy.cpp
new file mode 100644
index 000000000..ad7b9435f
--- /dev/null
+++ b/examples/cmake/add_subdir/dummy.cpp
@@ -0,0 +1,32 @@
+#include <thrust/detail/config.h>
+
+#include <iostream>
+
+int main()
+{
+  std::cout << "Hello from Thrust version " << THRUST_VERSION << ":\n"
+
+            << "Host system: "
+#if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
+            << "CPP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_OMP
+            << "OMP\n"
+#elif THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_TBB
+            << "TBB\n"
+#else
+            << "Unknown\n"
+#endif
+
+            << "Device system: "
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP
+            << "CPP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+            << "CUDA\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_OMP
+            << "OMP\n";
+#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB
+            << "TBB\n";
+#else
+            << "Unknown\n";
+#endif
+}
diff --git a/examples/cmake/add_subdir/dummy.cu b/examples/cmake/add_subdir/dummy.cu
new file mode 100644
index 000000000..b5645fc3d
--- /dev/null
+++ b/examples/cmake/add_subdir/dummy.cu
@@ -0,0 +1 @@
+#include "dummy.cpp"
diff --git a/examples/constant_iterator.cu b/examples/constant_iterator.cu
index 66a76ce2f..7e579f93d 100644
--- a/examples/constant_iterator.cu
+++ b/examples/constant_iterator.cu
@@ -2,10 +2,9 @@
 #include <thrust/transform.h>
 #include <thrust/functional.h>
 #include <thrust/device_vector.h>
-
-// for printing
 #include <thrust/copy.h> 
 #include <iterator>
+#include <iostream>
 
 int main(void)
 {
diff --git a/examples/counting_iterator.cu b/examples/counting_iterator.cu
index 196940a4a..e090e9e5e 100644
--- a/examples/counting_iterator.cu
+++ b/examples/counting_iterator.cu
@@ -2,9 +2,8 @@
 #include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/device_vector.h>
-
-// for printing
 #include <iterator>
+#include <iostream>
 
 int main(void)
 {
diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt
new file mode 100644
index 000000000..bd72c58c0
--- /dev/null
+++ b/examples/cuda/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB example_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(example_src IN LISTS example_srcs)
+    get_filename_component(example_name "${example_src}" NAME_WLE)
+    string(PREPEND example_name "cuda.")
+    thrust_add_example(example_target ${example_name} "${example_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/examples/cuda/async_reduce.cu b/examples/cuda/async_reduce.cu
index 36a49ae09..6e1584bcc 100644
--- a/examples/cuda/async_reduce.cu
+++ b/examples/cuda/async_reduce.cu
@@ -1,9 +1,10 @@
+#include <thrust/detail/config.h>
 #include <thrust/device_vector.h>
 #include <thrust/reduce.h>
 #include <thrust/system/cuda/execution_policy.h>
 #include <cassert>
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 #include <future>
 #endif
 
@@ -20,11 +21,13 @@
 // std::future to wait for the result of the reduction. This method requires a compiler which supports
 // C++11-capable language and library constructs.
 
+#ifdef THRUST_EXAMPLE_DEVICE_SIDE
 template<typename Iterator, typename T, typename BinaryOperation, typename Pointer>
 __global__ void reduce_kernel(Iterator first, Iterator last, T init, BinaryOperation binary_op, Pointer result)
 {
   *result = thrust::reduce(thrust::cuda::par, first, last, init, binary_op);
 }
+#endif
 
 int main()
 {
@@ -39,7 +42,11 @@ int main()
   cudaStreamCreate(&s);
 
   // launch a CUDA kernel with only 1 thread on our stream
+#ifdef THRUST_EXAMPLE_DEVICE_SIDE
   reduce_kernel<<<1,1,0,s>>>(data.begin(), data.end(), 0, thrust::plus<int>(), result.data());
+#else
+  result[0] = thrust::reduce(thrust::cuda::par, data.begin(), data.end(), 0, thrust::plus<int>());
+#endif
 
   // wait for the stream to finish
   cudaStreamSynchronize(s);
@@ -52,18 +59,18 @@ int main()
   // reset the result
   result[0] = 0;
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
   // method 2: use std::async to create asynchrony
 
   // copy all the algorithm parameters
-  auto begin     = data.begin();
-  auto end       = data.end();
-  auto init      = 0;
-  auto binary_op = thrust::plus<int>();
+  auto begin        = data.begin();
+  auto end          = data.end();
+  unsigned int init = 0;
+  auto binary_op    = thrust::plus<unsigned int>();
 
   // std::async captures the algorithm parameters by value
   // use std::launch::async to ensure the creation of a new thread
-  std::future<int> future_result = std::async(std::launch::async, [=]
+  std::future<unsigned int> future_result = std::async(std::launch::async, [=]
   {
     return thrust::reduce(begin, end, init, binary_op);
   });
diff --git a/examples/cuda/custom_temporary_allocation.cu b/examples/cuda/custom_temporary_allocation.cu
index 7253c8183..7bba0fa9e 100644
--- a/examples/cuda/custom_temporary_allocation.cu
+++ b/examples/cuda/custom_temporary_allocation.cu
@@ -6,153 +6,176 @@
 #include <thrust/pair.h>
 #include <cstdlib>
 #include <iostream>
+#include <sstream>
 #include <map>
 #include <cassert>
 
+// This example demonstrates how to control how Thrust allocates temporary
+// storage during algorithms such as thrust::sort. The idea will be to create a
+// simple cache of allocations to search when temporary storage is requested.
+// If a hit is found in the cache, we quickly return the cached allocation
+// instead of resorting to the more expensive thrust::cuda::malloc.
+
+// Note: Thrust now has its own caching allocator layer; if you just need a
+// caching allocator, you ought to use that. This example is still useful
+// as a demonstration of how to use a Thrust custom allocator.
 
-// This example demonstrates how to intercept calls to get_temporary_buffer
-// and return_temporary_buffer to control how Thrust allocates temporary storage
-// during algorithms such as thrust::sort. The idea will be to create a simple
-// cache of allocations to search when temporary storage is requested. If a hit
-// is found in the cache, we quickly return the cached allocation instead of
-// resorting to the more expensive thrust::cuda::malloc.
-//
 // Note: this implementation cached_allocator is not thread-safe. If multiple
 // (host) threads use the same cached_allocator then they should gain exclusive
 // access to the allocator before accessing its methods.
 
+struct not_my_pointer
+{
+  not_my_pointer(void* p)
+    : message()
+  {
+    std::stringstream s;
+    s << "Pointer `" << p << "` was not allocated by this allocator.";
+    message = s.str();
+  }
+
+  virtual ~not_my_pointer() {}
 
-// cached_allocator: a simple allocator for caching allocation requests
-class cached_allocator
+  virtual const char* what() const
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+// A simple allocator for caching cudaMalloc allocations.
+struct cached_allocator
 {
-  public:
-    // just allocate bytes
-    typedef char value_type;
+  typedef char value_type;
 
-    cached_allocator() {}
+  cached_allocator() {}
 
-    ~cached_allocator()
-    {
-      // free all allocations when cached_allocator goes out of scope
-      free_all();
-    }
+  ~cached_allocator()
+  {
+    free_all();
+  }
+
+  char *allocate(std::ptrdiff_t num_bytes)
+  {
+    std::cout << "cached_allocator::allocate(): num_bytes == "
+              << num_bytes
+              << std::endl;
+
+    char *result = 0;
 
-    char *allocate(std::ptrdiff_t num_bytes)
+    // Search the cache for a free block.
+    free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
+
+    if (free_block != free_blocks.end())
     {
-      char *result = 0;
+      std::cout << "cached_allocator::allocate(): found a free block"
+                << std::endl;
 
-      // search the cache for a free block
-      free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
+      result = free_block->second;
 
-      if(free_block != free_blocks.end())
+      // Erase from the `free_blocks` map.
+      free_blocks.erase(free_block);
+    }
+    else
+    {
+      // No allocation of the right size exists, so create a new one with
+      // `thrust::cuda::malloc`.
+      try
       {
-        std::cout << "cached_allocator::allocator(): found a hit" << std::endl;
+        std::cout << "cached_allocator::allocate(): allocating new block"
+                  << std::endl;
 
-        // get the pointer
-        result = free_block->second;
-
-        // erase from the free_blocks map
-        free_blocks.erase(free_block);
+        // Allocate memory and convert the resulting `thrust::cuda::pointer` to
+        // a raw pointer.
+        result = thrust::cuda::malloc<char>(num_bytes).get();
       }
-      else
+      catch (std::runtime_error&)
       {
-        // no allocation of the right size exists
-        // create a new one with cuda::malloc
-        // throw if cuda::malloc can't satisfy the request
-        try
-        {
-          std::cout << "cached_allocator::allocator(): no free block found; calling cuda::malloc" << std::endl;
-
-          // allocate memory and convert cuda::pointer to raw pointer
-          result = thrust::cuda::malloc<char>(num_bytes).get();
-        }
-        catch(std::runtime_error &e)
-        {
-          throw;
-        }
+        throw;
       }
+    }
 
-      // insert the allocated pointer into the allocated_blocks map
-      allocated_blocks.insert(std::make_pair(result, num_bytes));
+    // Insert the allocated pointer into the `allocated_blocks` map.
+    allocated_blocks.insert(std::make_pair(result, num_bytes));
 
-      return result;
-    }
+    return result;
+  }
 
-    void deallocate(char *ptr, size_t n)
-    {
-      // erase the allocated block from the allocated blocks map
-      allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
-      std::ptrdiff_t num_bytes = iter->second;
-      allocated_blocks.erase(iter);
+  void deallocate(char *ptr, size_t)
+  {
+    std::cout << "cached_allocator::deallocate(): ptr == "
+              << reinterpret_cast<void*>(ptr) << std::endl;
 
-      // insert the block into the free blocks map
-      free_blocks.insert(std::make_pair(num_bytes, ptr));
-    }
+    // Erase the allocated block from the allocated blocks map.
+    allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
 
-  private:
-    typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
-    typedef std::map<char *, std::ptrdiff_t>     allocated_blocks_type;
+    if (iter == allocated_blocks.end())
+      throw not_my_pointer(reinterpret_cast<void*>(ptr));
 
-    free_blocks_type      free_blocks;
-    allocated_blocks_type allocated_blocks;
+    std::ptrdiff_t num_bytes = iter->second;
+    allocated_blocks.erase(iter);
 
-    void free_all()
-    {
-      std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl;
+    // Insert the block into the free blocks map.
+    free_blocks.insert(std::make_pair(num_bytes, ptr));
+  }
 
-      // deallocate all outstanding blocks in both lists
-      for(free_blocks_type::iterator i = free_blocks.begin();
-          i != free_blocks.end();
-          ++i)
-      {
-        // transform the pointer to cuda::pointer before calling cuda::free
-        thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
-      }
+private:
+  typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
+  typedef std::map<char*, std::ptrdiff_t>      allocated_blocks_type;
 
-      for(allocated_blocks_type::iterator i = allocated_blocks.begin();
-          i != allocated_blocks.end();
-          ++i)
-      {
-        // transform the pointer to cuda::pointer before calling cuda::free
-        thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
-      }
+  free_blocks_type      free_blocks;
+  allocated_blocks_type allocated_blocks;
+
+  void free_all()
+  {
+    std::cout << "cached_allocator::free_all()" << std::endl;
+
+    // Deallocate all outstanding blocks in both lists.
+    for ( free_blocks_type::iterator i = free_blocks.begin()
+        ; i != free_blocks.end()
+        ; ++i)
+    {
+      // Transform the pointer to cuda::pointer before calling cuda::free.
+      thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
     }
 
+    for( allocated_blocks_type::iterator i = allocated_blocks.begin()
+       ; i != allocated_blocks.end()
+       ; ++i)
+    {
+      // Transform the pointer to cuda::pointer before calling cuda::free.
+      thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
+    }
+  }
 };
 
-
 int main()
 {
-#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
-  std::cout << "This feature requires gcc >= 4.4" << std::endl;
-  return 0;
-#endif
-
-  size_t n = 1 << 22;
+  std::size_t num_elements = 32768;
 
-  thrust::host_vector<int> h_input(n);
+  thrust::host_vector<int> h_input(num_elements);
 
-  // generate random input
+  // Generate random input.
   thrust::generate(h_input.begin(), h_input.end(), rand);
 
   thrust::cuda::vector<int> d_input = h_input;
-  thrust::cuda::vector<int> d_result(n);
+  thrust::cuda::vector<int> d_result(num_elements);
 
-  size_t num_trials = 5;
+  std::size_t num_trials = 5;
 
-  // create a cached_allocator object
   cached_allocator alloc;
 
-  for(size_t i = 0; i < num_trials; ++i)
+  for (std::size_t i = 0; i < num_trials; ++i)
   {
-    // initialize data to sort
     d_result = d_input;
 
-    // pass alloc through cuda::par as the first parameter to sort
-    // to cause allocations to be handled by alloc during sort
+    // Pass alloc through cuda::par as the first parameter to sort
+    // to cause allocations to be handled by alloc during sort.
     thrust::sort(thrust::cuda::par(alloc), d_result.begin(), d_result.end());
 
-    // ensure the result is sorted
+    // Ensure the result is sorted.
     assert(thrust::is_sorted(d_result.begin(), d_result.end()));
   }
 
diff --git a/examples/cuda/explicit_cuda_stream.cu b/examples/cuda/explicit_cuda_stream.cu
new file mode 100644
index 000000000..303a14723
--- /dev/null
+++ b/examples/cuda/explicit_cuda_stream.cu
@@ -0,0 +1,80 @@
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h> // For thrust::device
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+#include <cuda_runtime.h>
+
+#include <iostream>
+
+// This example shows how to execute a Thrust device algorithm on an explicit
+// CUDA stream. The simple program below fills a vector with the numbers
+// [0, 1000) (thrust::sequence) and then performs a scan operation
+// (thrust::inclusive_scan) on them. Both algorithms are executed on the same
+// custom CUDA stream using the CUDA execution policies.
+//
+// Thrust provides two execution policies that accept CUDA streams that differ
+// in when/if they synchronize the stream:
+// 1. thrust::cuda::par.on(stream)
+//      - `stream` will *always* be synchronized before an algorithm returns.
+//      - This is the default `thrust::device` policy when compiling with the
+//        CUDA device backend.
+// 2. thrust::cuda::par_nosync.on(stream)
+//      - `stream` will only be synchronized when necessary for correctness
+//        (e.g., returning a result from `thrust::reduce`). This is a hint that
+//        may be ignored by an algorithm's implementation.
+
+int main()
+{
+  thrust::device_vector<int> d_vec(1000);
+
+  // Create the stream:
+  cudaStream_t custom_stream;
+  cudaError_t err = cudaStreamCreate(&custom_stream);
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Error creating stream: " << cudaGetErrorString(err) << "\n";
+    return 1;
+  }
+
+  // Construct a new `nosync` execution policy with the custom stream
+  auto nosync_exec_policy = thrust::cuda::par_nosync.on(custom_stream);
+
+  // Fill the vector with sequential data.
+  // This will execute using the custom stream and the stream will *not* be
+  // synchronized before the function returns, meaning asynchronous work may
+  // still be executing after returning and the contents of `d_vec` are
+  // undefined. Synchronization is not needed here because the following
+  // `inclusive_scan` is executed on the same stream and is therefore guaranteed
+  // to be ordered after the `sequence`
+  thrust::sequence(nosync_exec_policy, d_vec.begin(), d_vec.end());
+
+  // Construct a new *synchronous* execution policy with the same custom stream
+  auto sync_exec_policy = thrust::cuda::par.on(custom_stream);
+
+  // Compute in-place inclusive sum scan of data in the vector.
+  // This also executes in the custom stream, but the execution policy ensures
+  // the stream is synchronized before the algorithm returns. This guarantees
+  // there is no pending asynchronous work and the contents of `d_vec` are
+  // immediately accessible.
+  thrust::inclusive_scan(sync_exec_policy,
+                         d_vec.cbegin(),
+                         d_vec.cend(),
+                         d_vec.begin());
+
+  // This access is only valid because the stream has been synchronized
+  int sum = d_vec.back();
+
+  // Free the stream:
+  err = cudaStreamDestroy(custom_stream);
+  if (err != cudaSuccess)
+  {
+    std::cerr << "Error destroying stream: " << cudaGetErrorString(err) << "\n";
+    return 1;
+  }
+
+  // Print the sum:
+  std::cout << "sum is " << sum << std::endl;
+
+  return 0;
+}
diff --git a/examples/cuda/fallback_allocator.cu b/examples/cuda/fallback_allocator.cu
deleted file mode 100644
index 9921722ff..000000000
--- a/examples/cuda/fallback_allocator.cu
+++ /dev/null
@@ -1,166 +0,0 @@
-#include <thrust/functional.h>
-#include <thrust/tabulate.h>
-#include <thrust/sort.h>
-#include <thrust/memory.h>
-#include <thrust/system/cuda/memory.h>
-
-#include <new> // for std::bad_alloc
-#include <iostream>
-
-// This example demonstrates how to implement a fallback for cudaMalloc
-// with a custom allocator. When cudaMalloc fails to allocate device memory
-// the fallback_allocator attempts to allocate pinned host memory and
-// then map the host buffer into the device address space. The
-// fallback_allocator enables the GPU to process data sets that are larger
-// than the device memory, albeit with a significantly reduced performance.
-
-
-// fallback_allocator is a memory allocator which uses pinned host memory as a functional fallback
-class fallback_allocator
-{
-  public:
-    // just allocate bytes
-    typedef char value_type;
-
-    // allocate's job to is allocate host memory as a functional fallback when cudaMalloc fails
-    char *allocate(std::ptrdiff_t n)
-    {
-      char *result = 0;
-
-      // attempt to allocate device memory
-      if(cudaMalloc(&result, n) == cudaSuccess)
-      {
-        std::cout << "  allocated " << n << " bytes of device memory" << std::endl;
-      }
-      else
-      {
-        // reset the last CUDA error
-        cudaGetLastError();
-
-        // attempt to allocate pinned host memory
-        void *h_ptr = 0;
-        if(cudaMallocHost(&h_ptr, n) == cudaSuccess)
-        {
-          // attempt to map host pointer into device memory space
-          if(cudaHostGetDevicePointer(&result, h_ptr, 0) == cudaSuccess)
-          {
-            std::cout << "  allocated " << n << " bytes of pinned host memory (fallback successful)" << std::endl;
-          }
-          else
-          {
-            // reset the last CUDA error
-            cudaGetLastError();
-
-            // attempt to deallocate buffer
-            std::cout << "  failed to map host memory into device address space (fallback failed)" << std::endl;
-            cudaFreeHost(h_ptr);
-
-            throw std::bad_alloc();
-          }
-        }
-        else
-        {
-          // reset the last CUDA error
-          cudaGetLastError();
-
-          std::cout << "  failed to allocate " << n << " bytes of memory (fallback failed)" << std::endl;
-
-          throw std::bad_alloc();
-        }
-      }
-
-      return result;
-    }
-
-    // deallocate's job to is inspect where the pointer lives and free it appropriately
-    void deallocate(char *ptr, size_t n)
-    {
-      void *raw_ptr = thrust::raw_pointer_cast(ptr);
-
-      // determine where memory resides
-      cudaPointerAttributes	attributes;
-
-      if(cudaPointerGetAttributes(&attributes, raw_ptr) == cudaSuccess)
-      {
-        // free the memory in the appropriate way
-        if(attributes.memoryType == cudaMemoryTypeHost)
-        {
-          cudaFreeHost(raw_ptr);
-        }
-        else
-        {
-          cudaFree(raw_ptr);
-        }
-      }
-    }
-};
-
-
-int main(void)
-{
-  // check whether device supports mapped host memory
-  int device;
-  cudaGetDevice(&device);
-  cudaDeviceProp properties;
-  cudaGetDeviceProperties(&properties, device);
-
-  fallback_allocator alloc;
-
-  // this example requires both unified addressing and memory mapping
-  if(!properties.unifiedAddressing || !properties.canMapHostMemory)
-  {
-    std::cout << "Device #" << device 
-              << " [" << properties.name << "] does not support memory mapping" << std::endl;
-    return 0;
-  }
-  else
-  {
-    std::cout << "Testing fallback_allocator on device #" << device 
-              << " [" << properties.name << "] with " 
-              << properties.totalGlobalMem << " bytes of device memory" << std::endl;
-  }
-
-  try
-  {
-    size_t one_million = 1 << 20;
-    size_t one_billion = 1 << 30;
-
-    for(size_t n = one_million; n < one_billion; n *= 2)
-    {
-      // TODO ideally we'd use the fallback_allocator in the vector too
-      //thrust::cuda::vector<int, fallback_allocator> d_vec(n);
-
-      std::cout << "attempting to sort " << n << " values" << std::endl;
-
-      // use our special malloc to allocate
-      int *raw_ptr = reinterpret_cast<int*>(alloc.allocate(n * sizeof(int)));
-
-      thrust::cuda::pointer<int> begin = thrust::cuda::pointer<int>(raw_ptr);
-      thrust::cuda::pointer<int> end   = begin + n;
-
-      // generate unsorted values
-      thrust::tabulate(begin, end, thrust::placeholders::_1 % 1024);
-
-      // sort the data using our special allocator
-      // if temporary memory is required during the sort,
-      // our allocator will be called
-      try
-      {
-        thrust::sort(thrust::cuda::par(alloc), begin, end);
-      }
-      catch(std::bad_alloc)
-      {
-        std::cout << "  caught std::bad_alloc from thrust::sort" << std::endl;
-      }
-
-      alloc.deallocate(reinterpret_cast<char*>(raw_ptr), n * sizeof(int));
-    }
-  }
-  catch(std::bad_alloc)
-  {
-    std::cout << "caught std::bad_alloc from malloc" << std::endl;
-  }
-
-  return 0;
-}
-
diff --git a/examples/cuda/global_device_vector.cu b/examples/cuda/global_device_vector.cu
new file mode 100644
index 000000000..a99566796
--- /dev/null
+++ b/examples/cuda/global_device_vector.cu
@@ -0,0 +1,46 @@
+#include <thrust/detail/config.h>
+#include <thrust/device_vector.h>
+
+// If you create a global `thrust::device_vector` with the default allocator,
+// you'll get an error during program termination when the memory of the vector
+// is freed, as the CUDA runtime cannot be used during program termination.
+//
+// To get around this, you can create your own allocator which ignores
+// deallocation failures that occur because the CUDA runtime is shut down.
+
+extern "C" cudaError_t cudaFreeIgnoreShutdown(void* ptr) {
+  cudaError_t const err = cudaFree(ptr);
+  if (cudaSuccess == err || cudaErrorCudartUnloading == err)
+    return cudaSuccess;
+  return err; 
+}
+
+typedef thrust::system::cuda::detail::cuda_memory_resource<
+  cudaMalloc, 
+  cudaFreeIgnoreShutdown,
+  thrust::cuda::pointer<void>
+> device_ignore_shutdown_memory_resource;
+
+#if THRUST_CPP_DIALECT >= 2011
+  template <typename T>
+  using device_ignore_shutdown_allocator = 
+    thrust::mr::stateless_resource_allocator<
+      T,
+      thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
+    >;
+    
+  thrust::device_vector<double, device_ignore_shutdown_allocator<double>> d;
+#else
+  thrust::device_vector<
+    double, 
+    thrust::mr::stateless_resource_allocator<
+      double,
+      thrust::device_ptr_memory_resource<device_ignore_shutdown_memory_resource>
+    > 
+  > d;
+#endif
+
+int main() {
+  d.resize(25);
+}
+
diff --git a/examples/cuda/range_view.cu b/examples/cuda/range_view.cu
index a5a86ba76..2ede62047 100644
--- a/examples/cuda/range_view.cu
+++ b/examples/cuda/range_view.cu
@@ -12,11 +12,6 @@
 // access that data from a device function. Even though device_vectors are not
 // accessible from device functions, the range_view class allows us to access
 // and manipulate its data as if we were manipulating a real container.
-//
-
-// This example demonstrate use of range_view with for_each algorithm which is
-// dispatch from GPU
-//
 
 template<class Iterator>
 class range_view
@@ -193,13 +188,6 @@ void saxpy(float A, View1 X, View2 Y, View3 Z)
       saxpy_functor<View1,View2,View3>(A,X,Y,Z));
 }
 
-template<class View1, class View2, class View3>
-__global__
-void saxpy_kernel(float A, View1 X, View2 Y, View3 Z)
-{
-  saxpy(A, X, Y, Z);
-}
-
 struct f1 : public thrust::unary_function<float,float>
 {
   __host__ __device__
@@ -209,7 +197,7 @@ struct f1 : public thrust::unary_function<float,float>
   }
 };
 
-int main(int argc, char* argv[])
+int main()
 {
   using std::cout;
   using std::endl;
@@ -223,7 +211,7 @@ int main(int argc, char* argv[])
   thrust::device_vector<float> Y(y, y + 4);
   thrust::device_vector<float> Z(z, z + 4);
 
-  saxpy_kernel<<<1, 1>>>(
+  saxpy(
       2.0, 
 
       // make a range view of a pair of transform_iterators
@@ -235,11 +223,10 @@ int main(int argc, char* argv[])
 
       // range view of naked pointers
       make_range_view(Z.data().get(), 4));
-  assert(cudaSuccess == cudaDeviceSynchronize());
 
   // print values from original device_vector<float> Z 
   // to ensure that range view was mapped to this vector
-  for (int i = 0, n = Z.size(); i < n; ++i)
+  for (std::size_t i = 0, n = Z.size(); i < n; ++i)
   {
     cout << "z[" << i << "]= " << Z[i] << endl;
   }
diff --git a/examples/cuda/simple_cuda_streams.cu b/examples/cuda/simple_cuda_streams.cu
deleted file mode 100644
index e165fbef3..000000000
--- a/examples/cuda/simple_cuda_streams.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <thrust/device_vector.h>
-#include <thrust/for_each.h>
-#include <thrust/system/cuda/execution_policy.h>
-#include <cstdio>
-#include <cassert>
-
-// This example demonstrates how to achieve asynchronous, concurrent algorithm execution using
-// the CUDA backend's low-level stream-based interface. This program uses thrust::for_each to invoke
-// two functors, "ping", and "pong", which communicate via a shared variable, "ball". To encourage
-// concurrency, we execute thrust::for_each on two independent CUDA streams using the thrust::cuda::par
-// execution policy.
-//
-// Note that stream usage provides no guarantee of concurrency. If the ping and pong functions
-// do not happen to be scheduled concurrently, this program will deadlock.
-
-struct ping
-{
-  // XXX nvcc issue prevents us from making ball volatile
-  //__device__
-  //void operator()(volatile int &ball)
-  __device__
-  void operator()(int &ball)
-  {
-    // we're not guaranteed concurrency, so only attempt this 1000 times
-    unsigned int attempt = 0;
-
-    ball = 1;
-
-    for(unsigned int next_state = 2;
-        next_state < 25 && attempt < 1000;
-        next_state += 2)
-    {
-      while(ball != next_state && attempt < 1000)
-      {
-#if __CUDA_ARCH__ >= 200
-        printf("ping waiting for return\n");
-#endif
-        ++attempt;
-      }
-
-      ball += 1;
-
-#if __CUDA_ARCH__ >= 200
-      printf("ping! ball is now %d\n", next_state + 1);
-#endif
-    }
-  }
-};
-
-struct pong
-{
-  // XXX nvcc issue prevents us from making ball volatile
-  //__device__
-  //void operator()(volatile int &ball)
-  __device__
-  void operator()(int &ball)
-  {
-    // we're not guaranteed concurrency, so only attempt this 1000 times
-    unsigned int attempt = 0;
-
-    for(unsigned int next_state = 1;
-        next_state < 25 && attempt < 1000;
-        next_state += 2)
-    {
-      while(ball != next_state && attempt < 1000)
-      {
-#if __CUDA_ARCH__ >= 200
-        printf("pong waiting for return\n");
-#endif
-        ++attempt;
-      }
-
-      ball += 1;
-
-#if __CUDA_ARCH__ >= 200
-      printf("pong! ball is now %d\n", next_state + 1);
-#endif
-    }
-  }
-};
-
-int main()
-{
-  cudaStream_t s1, s2;
-  cudaStreamCreate(&s1);
-  cudaStreamCreate(&s2);
-
-  thrust::device_vector<int> ball(1);
-
-  // Invoke thrust::for_each with the thrust::cuda::par
-  // execution policy. Pass the stream s1 as an argument
-  // to the .on() function
-  thrust::for_each(thrust::cuda::par.on(s1),
-                   ball.begin(),
-                   ball.end(),
-                   ping());
-
-  // Invoke thrust::for_each with the thrust::cuda::par
-  // execution policy. Pass the stream s2 as an argument
-  // to the .on() function
-  thrust::for_each(thrust::cuda::par.on(s2),
-                   ball.begin(),
-                   ball.end(),
-                   pong());
-
-  // Wait for all algorithms executed on the streams to terminate.
-  cudaStreamSynchronize(s1);
-  cudaStreamSynchronize(s2);
-
-  cudaStreamDestroy(s1);
-  cudaStreamDestroy(s2);
-
-  return 0;
-}
-
-
diff --git a/examples/device_ptr.cu b/examples/device_ptr.cu
index 04ae90fea..0074a0250 100644
--- a/examples/device_ptr.cu
+++ b/examples/device_ptr.cu
@@ -6,6 +6,7 @@
 #include <thrust/reduce.h>
 
 #include <cassert>
+#include <iostream>
 
 int main(void)
 {
@@ -36,6 +37,7 @@ int main(void)
 
   // back to where we started
   assert(wrapped_ptr == d_ptr);
+  (void)wrapped_ptr; // for when NDEBUG is defined
 
   // deallocate device memory
   thrust::device_free(d_ptr);
diff --git a/examples/discrete_voronoi.cu b/examples/discrete_voronoi.cu
index 93e7e5622..bfbf2242d 100644
--- a/examples/discrete_voronoi.cu
+++ b/examples/discrete_voronoi.cu
@@ -4,10 +4,10 @@
 #include <thrust/extrema.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
-#include <iostream>
 
+#include <iostream>
 #include <iomanip>
-#include <stdio.h>
+#include <fstream>
 #include <cmath>
 
 #include "include/timer.h"
@@ -135,21 +135,26 @@ void generate_random_sites(thrust::host_vector<int> &t, int Nb, int m, int n)
 //Export the tab to PGM image format
 void vector_to_pgm(thrust::host_vector<int> &t, int m, int n, const char *out)
 {
-    FILE *f;
+    assert(static_cast<int>(t.size()) == m * n &&
+           "Vector size does not match image dims.");
 
-    f=fopen(out,"w+t");
-    fprintf(f,"P2\n");
-    fprintf(f,"%d %d\n 253\n",m,n);
+    std::fstream f(out, std::fstream::out);
+    f << "P2\n";
+    f << m << " " << n << "\n";
+    f << "253\n";
+
+    //Hash function to map values to [0,255]
+    auto to_grey_level = [](int in_value) -> int
+    {
+        return (71 * in_value) % 253;
+    };
 
-    for(int j = 0; j < n ; j++)
+    for (int value : t)
     {
-        for(int i = 0; i < m ; i++)
-        {
-            fprintf(f,"%d ",(int)(71*t[j*m+i])%253); //Hash function to map values to [0,255]
-        }
+      f << to_grey_level(value) << " ";
     }
-    fprintf(f,"\n");
-    fclose(f);
+    f << "\n";
+    f.close();
 }
 
 /************Main Jfa loop********************/
diff --git a/examples/dot_products_with_zip.cu b/examples/dot_products_with_zip.cu
index 52e33d8e6..81ff7ac12 100644
--- a/examples/dot_products_with_zip.cu
+++ b/examples/dot_products_with_zip.cu
@@ -6,9 +6,9 @@
 #include <thrust/random.h>
 
 
-// This example shows how thrust::zip_iterator can be used to create a 
-// 'virtual' array of structures.  In this case the structure is a 3d 
-// vector type (Float3) whose (x,y,z) components will be stored in 
+// This example shows how thrust::zip_iterator can be used to create a
+// 'virtual' array of structures.  In this case the structure is a 3d
+// vector type (Float3) whose (x,y,z) components will be stored in
 // three separate float arrays.  The zip_iterator "zips" these arrays
 // into a single virtual Float3 array.
 
@@ -54,17 +54,17 @@ int main(void)
     // We'll store the components of the 3d vectors in separate arrays. One set of
     // arrays will store the 'A' vectors and another set will store the 'B' vectors.
 
-    // This 'structure of arrays' (SoA) approach is usually more efficient than the 
+    // This 'structure of arrays' (SoA) approach is usually more efficient than the
     // 'array of structures' (AoS) approach.  The primary reason is that structures,
     // like Float3, don't always obey the memory coalescing rules, so they are not
     // efficiently transferred to and from memory.  Another reason to prefer SoA to
     // AoS is that we don't aways want to process all members of the structure.  For
-    // example, if we only need to look at first element of the structure then it 
+    // example, if we only need to look at first element of the structure then it
     // is wasteful to load the entire structure from memory.  With the SoA approach,
     // we can chose which elements of the structure we wish to read.
 
     thrust::device_vector<float> A0 = random_vector(N);  // x components of the 'A' vectors
-    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors 
+    thrust::device_vector<float> A1 = random_vector(N);  // y components of the 'A' vectors
     thrust::device_vector<float> A2 = random_vector(N);  // z components of the 'A' vectors
 
     thrust::device_vector<float> B0 = random_vector(N);  // x components of the 'B' vectors
@@ -78,7 +78,7 @@ int main(void)
     // We'll now illustrate two ways to use zip_iterator to compute the dot
     // products.  The first method is verbose but shows how the parts fit together.
     // The second method hides these details and is more concise.
-   
+
 
     // METHOD #1
     // Defining a zip_iterator type can be a little cumbersome ...
@@ -87,24 +87,24 @@ int main(void)
     typedef thrust::zip_iterator<FloatIteratorTuple>                   Float3Iterator;
 
     // Now we'll create some zip_iterators for A and B
-    Float3Iterator A_first = thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin()));
-    Float3Iterator A_last  = thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end()));
-    Float3Iterator B_first = thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin()));
-                            
+    Float3Iterator A_first = thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin()));
+    Float3Iterator A_last  = thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end()));
+    Float3Iterator B_first = thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin()));
+
     // Finally, we pass the zip_iterators into transform() as if they
     // were 'normal' iterators for a device_vector<Float3>.
     thrust::transform(A_first, A_last, B_first, result.begin(), DotProduct());
 
 
     // METHOD #2
-    // Alternatively, we can avoid creating variables for X_first, X_last, 
+    // Alternatively, we can avoid creating variables for X_first, X_last,
     // and Y_first and invoke transform() directly.
-    thrust::transform( thrust::make_zip_iterator(make_tuple(A0.begin(), A1.begin(), A2.begin())),
-                       thrust::make_zip_iterator(make_tuple(A0.end(),   A1.end(),   A2.end())),
-                       thrust::make_zip_iterator(make_tuple(B0.begin(), B1.begin(), B2.begin())),
+    thrust::transform( thrust::make_zip_iterator(thrust::make_tuple(A0.begin(), A1.begin(), A2.begin())),
+                       thrust::make_zip_iterator(thrust::make_tuple(A0.end(),   A1.end(),   A2.end())),
+                       thrust::make_zip_iterator(thrust::make_tuple(B0.begin(), B1.begin(), B2.begin())),
                        result.begin(),
                        DotProduct() );
-    
+
 
 
     // Finally, we'll print a few results
@@ -126,8 +126,8 @@ int main(void)
         std::cout << "(" << thrust::get<0>(b) << "," << thrust::get<1>(b) << "," << thrust::get<2>(b) << ")";
         std::cout << " = ";
         std::cout << dot << std::endl;
-    }   
+    }
 
     return 0;
 }
- 
+
diff --git a/examples/expand.cu b/examples/expand.cu
index 4547bcd13..f61edec8f 100644
--- a/examples/expand.cu
+++ b/examples/expand.cu
@@ -51,7 +51,6 @@ OutputIterator expand(InputIterator1 first1,
      thrust::maximum<difference_type>());
 
   // gather input values according to index array (output = first2[output_indices])
-  OutputIterator output_end = output; thrust::advance(output_end, output_size);
   thrust::gather(output_indices.begin(),
                  output_indices.end(),
                  first2,
diff --git a/examples/lambda.cu b/examples/lambda.cu
index b2cb4a9fb..65b75f627 100644
--- a/examples/lambda.cu
+++ b/examples/lambda.cu
@@ -1,6 +1,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
+#include <iostream>
 
 // This example demonstrates the use of placeholders to implement
 // the SAXPY operation (i.e. Y[i] = a * X[i] + Y[i]).
diff --git a/examples/max_abs_diff.cu b/examples/max_abs_diff.cu
index 93ec06db3..c9ae4d337 100644
--- a/examples/max_abs_diff.cu
+++ b/examples/max_abs_diff.cu
@@ -14,7 +14,7 @@ struct abs_diff : public thrust::binary_function<T,T,T>
     __host__ __device__
     T operator()(const T& a, const T& b)
     {
-        return std::fabs(b - a);
+        return fabsf(b - a);
     }
 };
 
diff --git a/examples/monte_carlo_disjoint_sequences.cu b/examples/monte_carlo_disjoint_sequences.cu
index ed804268e..77b0d0086 100644
--- a/examples/monte_carlo_disjoint_sequences.cu
+++ b/examples/monte_carlo_disjoint_sequences.cu
@@ -51,7 +51,7 @@ struct estimate_pi : public thrust::unary_function<unsigned int,float>
       float y = u01(rng);
 
       // measure distance from the origin
-      float dist = std::sqrt(x*x + y*y);
+      float dist = sqrtf(x*x + y*y);
 
       // add 1.0f if (u0,u1) is inside the quarter circle
       if(dist <= 1.0f)
diff --git a/examples/mr_basic.cu b/examples/mr_basic.cu
new file mode 100644
index 000000000..733799425
--- /dev/null
+++ b/examples/mr_basic.cu
@@ -0,0 +1,82 @@
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/pool.h>
+#include <thrust/mr/disjoint_pool.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_ptr.h>
+
+#include <cassert>
+
+template<typename Vec>
+void do_stuff_with_vector(typename Vec::allocator_type alloc)
+{
+    Vec v1(alloc);
+    v1.push_back(1);
+    assert(v1.back() == 1);
+
+    Vec v2(alloc);
+    v2 = v1;
+
+    v1.swap(v2);
+
+    v1.clear();
+    v1.resize(2);
+    assert(v1.size() == 2);
+}
+
+int main()
+{
+    thrust::mr::new_delete_resource memres;
+
+    {
+        // no virtual calls will be issued
+        typedef thrust::mr::allocator<int, thrust::mr::new_delete_resource> Alloc;
+        Alloc alloc(&memres);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    {
+        // virtual calls will be issued - wrapping in a polymorphic wrapper
+        thrust::mr::polymorphic_adaptor_resource<void *> adaptor(&memres);
+        typedef thrust::mr::polymorphic_allocator<int, void *> Alloc;
+        Alloc alloc(&adaptor);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    {
+        // use the global device_ptr-flavored device memory resource
+        typedef thrust::device_ptr_memory_resource<thrust::device_memory_resource> Resource;
+        thrust::mr::polymorphic_adaptor_resource<thrust::device_ptr<void> > adaptor(
+            thrust::mr::get_global_resource<Resource>()
+        );
+        typedef thrust::mr::polymorphic_allocator<int, thrust::device_ptr<void> > Alloc;
+        Alloc alloc(&adaptor);
+
+        do_stuff_with_vector<thrust::device_vector<int, Alloc> >(alloc);
+    }
+
+    typedef thrust::mr::unsynchronized_pool_resource<
+        thrust::mr::new_delete_resource
+    > Pool;
+    Pool pool(&memres);
+    {
+        typedef thrust::mr::allocator<int, Pool> Alloc;
+        Alloc alloc(&pool);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+
+    typedef thrust::mr::disjoint_unsynchronized_pool_resource<
+        thrust::mr::new_delete_resource,
+        thrust::mr::new_delete_resource
+    > DisjointPool;
+    DisjointPool disjoint_pool(&memres, &memres);
+    {
+        typedef thrust::mr::allocator<int, DisjointPool> Alloc;
+        Alloc alloc(&disjoint_pool);
+
+        do_stuff_with_vector<thrust::host_vector<int, Alloc> >(alloc);
+    }
+}
diff --git a/examples/norm.cu b/examples/norm.cu
index f8723dfbf..0892baaf9 100644
--- a/examples/norm.cu
+++ b/examples/norm.cu
@@ -3,6 +3,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <cmath>
+#include <iostream>
 
 //   This example computes the norm [1] of a vector.  The norm is 
 // computed by squaring all numbers in the vector, summing the 
diff --git a/examples/permutation_iterator.cu b/examples/permutation_iterator.cu
index 5ff52f564..793c8aa12 100644
--- a/examples/permutation_iterator.cu
+++ b/examples/permutation_iterator.cu
@@ -1,6 +1,7 @@
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/reduce.h>
 #include <thrust/device_vector.h>
+#include <iostream>
 
 // this example fuses a gather operation with a reduction for
 // greater efficiency than separate gather() and reduce() calls
diff --git a/examples/raw_reference_cast.cu b/examples/raw_reference_cast.cu
index 440d98338..d6c854590 100644
--- a/examples/raw_reference_cast.cu
+++ b/examples/raw_reference_cast.cu
@@ -1,8 +1,8 @@
 #include <thrust/detail/raw_reference_cast.h>
-
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 #include <thrust/fill.h>
+#include <iostream>
 
 // This example illustrates how to use the raw_reference_cast to convert
 // system-specific reference wrappers into native references.
@@ -84,11 +84,9 @@ int main(void)
   typedef Vector::iterator           Iterator;
   typedef thrust::device_system_tag  System;
 
-  size_t N = 5;
-
   // allocate device memory
-  Vector A(N);
-  Vector B(N);
+  Vector A(5);
+  Vector B(5);
 
   // initialize A and B
   thrust::sequence(A.begin(), A.end());
@@ -100,7 +98,7 @@ int main(void)
 
   // note: we must specify the System to ensure correct execution
   thrust::for_each(thrust::counting_iterator<int,System>(0),
-                   thrust::counting_iterator<int,System>(N),
+                   thrust::counting_iterator<int,System>(5),
                    copy_iterators<Iterator,Iterator>(A.begin(), B.begin()));
   
   std::cout << "After A->B Copy" << std::endl;
diff --git a/examples/repeated_range.cu b/examples/repeated_range.cu
index 64d50077c..a309b80a6 100644
--- a/examples/repeated_range.cu
+++ b/examples/repeated_range.cu
@@ -2,13 +2,10 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/functional.h>
-
 #include <thrust/fill.h>
 #include <thrust/device_vector.h>
-
-// for printing
 #include <thrust/copy.h>
-#include <ostream>
+#include <iostream>
 
 // this example illustrates how to make repeated access to a range of values
 // examples:
diff --git a/examples/scan_by_key.cu b/examples/scan_by_key.cu
index 2eba55081..f353da556 100644
--- a/examples/scan_by_key.cu
+++ b/examples/scan_by_key.cu
@@ -10,7 +10,7 @@ struct head_flag_predicate
     : public thrust::binary_function<HeadFlagType,HeadFlagType,bool>
 {
     __host__ __device__
-    bool operator()(HeadFlagType left, HeadFlagType right) const
+    bool operator()(HeadFlagType, HeadFlagType right) const
     {
         return !right;
     }
diff --git a/examples/scan_matrix_by_rows.cu b/examples/scan_matrix_by_rows.cu
new file mode 100644
index 000000000..2cf1986e9
--- /dev/null
+++ b/examples/scan_matrix_by_rows.cu
@@ -0,0 +1,73 @@
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <assert.h>
+
+// We have a matrix stored in a `thrust::device_vector`. We want to perform a
+// scan on each row of a matrix.
+
+__host__
+void scan_matrix_by_rows0(thrust::device_vector<int>& u, int n, int m) {
+  // Here, we launch a separate scan for each row in the matrix. This works,
+  // but each kernel only does a small amount of work. It would be better if we
+  // could launch one big kernel for the entire matrix.
+  for (int i = 0; i < n; ++i)
+    thrust::inclusive_scan(u.begin() + m * i, u.begin() + m * (i + 1),
+                           u.begin() + m * i);
+}
+
+// We can batch the operation using `thrust::inclusive_scan_by_key`, which
+// scans each group of consecutive equal keys. All we need to do is generate
+// the right key sequence. We want the keys for elements on the same row to
+// be identical.
+
+// So first, we define an unary function object which takes the index of an
+// element and returns the row that it belongs to.
+
+struct which_row : thrust::unary_function<int, int> {
+  int row_length;
+
+  __host__ __device__
+  which_row(int row_length_) : row_length(row_length_) {}
+
+  __host__ __device__
+  int operator()(int idx) const {
+    return idx / row_length;
+  }
+};
+
+__host__
+void scan_matrix_by_rows1(thrust::device_vector<int>& u, int n, int m) {
+  // This `thrust::counting_iterator` represents the index of the element.
+  thrust::counting_iterator<int> c_first(0);
+
+  // We construct a `thrust::transform_iterator` which applies the `which_row`
+  // function object to the index of each element.
+  thrust::transform_iterator<which_row, thrust::counting_iterator<int> >
+    t_first(c_first, which_row(m));
+
+  // Finally, we use our `thrust::transform_iterator` as the key sequence to
+  // `thrust::inclusive_scan_by_key`.
+  thrust::inclusive_scan_by_key(t_first, t_first + n * m, u.begin(), u.begin());
+}
+
+int main() {
+  int const n = 4;
+  int const m = 5;
+
+  thrust::device_vector<int> u0(n * m);
+  thrust::sequence(u0.begin(), u0.end());
+  scan_matrix_by_rows0(u0, n, m);
+
+  thrust::device_vector<int> u1(n * m);
+  thrust::sequence(u1.begin(), u1.end());
+  scan_matrix_by_rows1(u1, n, m);
+
+  for (int i = 0; i < n; ++i)
+    for (int j = 0; j < m; ++j)
+      assert(u0[j + m * i] == u1[j + m * i]);
+}
+
diff --git a/examples/sort.cu b/examples/sort.cu
index 700fc5f3f..1bbb5d897 100644
--- a/examples/sort.cu
+++ b/examples/sort.cu
@@ -41,7 +41,7 @@ void initialize(thrust::device_vector<int>& v1, thrust::device_vector<int>& v2)
   for(size_t i = 0; i < v1.size(); i++)
   {
     v1[i] = dist(rng);
-    v2[i] = i;
+    v2[i] = static_cast<int>(i);
   }
 }
 
diff --git a/examples/sorting_aos_vs_soa.cu b/examples/sorting_aos_vs_soa.cu
index 1bf990982..649a78ab1 100644
--- a/examples/sorting_aos_vs_soa.cu
+++ b/examples/sorting_aos_vs_soa.cu
@@ -1,3 +1,4 @@
+#include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
 #include <thrust/sort.h>
 #include <thrust/random.h>
@@ -7,7 +8,7 @@
 
 // This examples compares sorting performance using Array of Structures (AoS)
 // and Structure of Arrays (SoA) data layout.  Legacy applications will often
-// store data in C/C++ structs, such as MyStruct defined below.  Although 
+// store data in C/C++ structs, such as MyStruct defined below.  Although
 // Thrust can process array of structs, it is typically less efficient than
 // the equivalent structure of arrays layout.  In this particular example,
 // the optimized SoA approach is approximately *five times faster* than the
@@ -57,7 +58,7 @@ int main(void)
 {
   size_t N = 2 * 1024 * 1024;
 
-  // Sort Key-Value pairs using Array of Structures (AoS) storage 
+  // Sort Key-Value pairs using Array of Structures (AoS) storage
   {
     thrust::device_vector<MyStruct> structures(N);
 
@@ -71,7 +72,7 @@ int main(void)
     std::cout << "AoS sort took " << 1e3 * t.elapsed() << " milliseconds" << std::endl;
   }
 
-  // Sort Key-Value pairs using Structure of Arrays (SoA) storage 
+  // Sort Key-Value pairs using Structure of Arrays (SoA) storage
   {
     thrust::device_vector<int>   keys(N);
     thrust::device_vector<float> values(N);
diff --git a/examples/sparse_vector.cu b/examples/sparse_vector.cu
index c7528cff2..463bfa008 100644
--- a/examples/sparse_vector.cu
+++ b/examples/sparse_vector.cu
@@ -11,7 +11,6 @@ template <typename IndexVector,
 void print_sparse_vector(const IndexVector& A_index,
                          const ValueVector& A_value)
 {
-    // sanity test
     assert(A_index.size() == A_value.size());
 
     for(size_t i = 0; i < A_index.size(); i++)
@@ -35,7 +34,6 @@ void sum_sparse_vectors(const IndexVector1& A_index,
     typedef typename IndexVector3::value_type  IndexType;
     typedef typename ValueVector3::value_type  ValueType;
 
-    // sanity test
     assert(A_index.size() == A_value.size());
     assert(B_index.size() == B_value.size());
 
@@ -53,7 +51,7 @@ void sum_sparse_vectors(const IndexVector1& A_index,
                          B_value.begin(),
                          temp_index.begin(),
                          temp_value.begin());
-    
+
     // compute number of unique indices
     size_t C_size = thrust::inner_product(temp_index.begin(), temp_index.end() - 1,
                                           temp_index.begin() + 1,
@@ -83,7 +81,7 @@ int main(void)
     A_index[1] = 3;  A_value[1] = 60;
     A_index[2] = 5;  A_value[2] = 20;
     A_index[3] = 8;  A_value[3] = 40;
-    
+
     // initialize sparse vector B with 6 elements
     thrust::device_vector<int>   B_index(6);
     thrust::device_vector<float> B_value(6);
@@ -97,7 +95,7 @@ int main(void)
     // compute sparse vector C = A + B
     thrust::device_vector<int>   C_index;
     thrust::device_vector<float> C_value;
-    
+
     sum_sparse_vectors(A_index, A_value, B_index, B_value, C_index, C_value);
 
     std::cout << "Computing C = A + B for sparse vectors A and B" << std::endl;
diff --git a/examples/strided_range.cu b/examples/strided_range.cu
index 5beb7cdf6..3457bc1ca 100644
--- a/examples/strided_range.cu
+++ b/examples/strided_range.cu
@@ -2,13 +2,10 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/functional.h>
-
 #include <thrust/fill.h>
 #include <thrust/device_vector.h>
-
-// for printing
 #include <thrust/copy.h>
-#include <ostream>
+#include <iostream>
 
 // this example illustrates how to make strided access to a range of values
 // examples:
diff --git a/examples/summary_statistics.cu b/examples/summary_statistics.cu
index a23b499f0..38785e2b7 100644
--- a/examples/summary_statistics.cu
+++ b/examples/summary_statistics.cu
@@ -5,6 +5,7 @@
 #include <thrust/extrema.h>
 #include <cmath>
 #include <limits>
+#include <iostream>
 
 // This example computes several statistical properties of a data
 // series in a single reduction.  The algorithm is described in detail here:
diff --git a/examples/summed_area_table.cu b/examples/summed_area_table.cu
index 6fe5b095a..d962df25b 100644
--- a/examples/summed_area_table.cu
+++ b/examples/summed_area_table.cu
@@ -62,7 +62,7 @@ void transpose(size_t m, size_t n, thrust::device_vector<T>& src, thrust::device
 
 // scan the rows of an M-by-N array
 template <typename T>
-void scan_horizontally(size_t m, size_t n, thrust::device_vector<T>& d_data)
+void scan_horizontally(size_t n, thrust::device_vector<T>& d_data)
 {
   thrust::counting_iterator<size_t> indices(0);
 
@@ -99,7 +99,7 @@ int main(void)
   print(m, n, data);
 
   std::cout << "[step 1] scan horizontally" << std::endl;
-  scan_horizontally(m, n, data);
+  scan_horizontally(n, data);
   print(m, n, data);
 
   std::cout << "[step 2] transpose array" << std::endl;
@@ -108,7 +108,7 @@ int main(void)
   print(n, m, temp);
 
   std::cout << "[step 3] scan transpose horizontally" << std::endl;
-  scan_horizontally(n, m, temp);
+  scan_horizontally(m, temp);
   print(n, m, temp);
 
   std::cout << "[step 4] transpose the transpose" << std::endl;
diff --git a/examples/tiled_range.cu b/examples/tiled_range.cu
index 4f570f749..51cc27d5f 100644
--- a/examples/tiled_range.cu
+++ b/examples/tiled_range.cu
@@ -2,13 +2,10 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/functional.h>
-
 #include <thrust/fill.h>
 #include <thrust/device_vector.h>
-
-// for printing
 #include <thrust/copy.h>
-#include <ostream>
+#include <iostream>
 
 // this example illustrates how to tile a range multiple times
 // examples:
diff --git a/examples/transform_input_output_iterator.cu b/examples/transform_input_output_iterator.cu
new file mode 100644
index 000000000..afdccc35a
--- /dev/null
+++ b/examples/transform_input_output_iterator.cu
@@ -0,0 +1,111 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+#include <thrust/sequence.h>
+#include <iostream>
+
+// Base 2 fixed point
+class ScaledInteger
+{
+  int value_;
+  int scale_;
+
+public:
+  __host__ __device__
+  ScaledInteger(int value, int scale): value_{value}, scale_{scale} {}
+
+  __host__ __device__
+  int value() const { return value_; }
+
+  __host__ __device__
+  ScaledInteger rescale(int scale) const
+  {
+    int shift = scale - scale_;
+    int result = shift < 0 ? value_ << (-shift) : value_ >> shift;
+    return ScaledInteger{result, scale};
+  }
+
+  __host__ __device__
+  friend ScaledInteger operator+(ScaledInteger a, ScaledInteger b)
+  {
+    // Rescale inputs to the lesser of the two scales
+    if (b.scale_ < a.scale_)
+      a = a.rescale(b.scale_);
+    else if (a.scale_ < b.scale_)
+      b = b.rescale(a.scale_);
+    return ScaledInteger{a.value_ + b.value_, a.scale_};
+  }
+};
+
+struct ValueToScaledInteger
+{
+  int scale;
+
+  __host__ __device__
+  ScaledInteger operator()(const int& value) const
+  {
+    return ScaledInteger{value, scale};
+  }
+};
+
+struct ScaledIntegerToValue
+{
+  int scale;
+
+  __host__ __device__
+  int operator()(const ScaledInteger& scaled) const
+  {
+    return scaled.rescale(scale).value();
+  }
+};
+
+int main(void)
+{
+  const size_t size = 4;
+  thrust::device_vector<int> A(size);
+  thrust::device_vector<int> B(size);
+  thrust::device_vector<int> C(size);
+
+  thrust::sequence(A.begin(), A.end(), 1);
+  thrust::sequence(B.begin(), B.end(), 5);
+
+  const int A_scale = 16; // Values in A are left shifted by 16
+  const int B_scale = 8;  // Values in B are left shifted by 8
+  const int C_scale = 4;  // Values in C are left shifted by 4
+
+  auto A_begin = thrust::make_transform_input_output_iterator(A.begin(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto A_end   = thrust::make_transform_input_output_iterator(A.end(),
+                    ValueToScaledInteger{A_scale}, ScaledIntegerToValue{A_scale});
+  auto B_begin = thrust::make_transform_input_output_iterator(B.begin(),
+                    ValueToScaledInteger{B_scale}, ScaledIntegerToValue{B_scale});
+  auto C_begin = thrust::make_transform_input_output_iterator(C.begin(),
+                    ValueToScaledInteger{C_scale}, ScaledIntegerToValue{C_scale});
+
+  // Sum A and B as ScaledIntegers, storing the scaled result in C
+  thrust::transform(A_begin, A_end, B_begin, C_begin, thrust::plus<ScaledInteger>{});
+
+  thrust::host_vector<int> A_h(A);
+  thrust::host_vector<int> B_h(B);
+  thrust::host_vector<int> C_h(C);
+
+  std::cout << std::hex;
+
+  std::cout << "Expected [ ";
+  for (size_t i = 0; i < size; i++) {
+    const int expected = ((A_h[i] << A_scale) + (B_h[i] << B_scale)) >> C_scale;
+    std::cout << expected <<  " ";
+  }
+  std::cout << "] \n";
+
+  std::cout << "Result   [ ";
+  for (size_t i = 0; i < size; i++) {
+    std::cout << C_h[i] <<  " ";
+  }
+  std::cout << "] \n";
+
+  return 0;
+}
+
diff --git a/examples/transform_output_iterator.cu b/examples/transform_output_iterator.cu
new file mode 100644
index 000000000..1c5a05e06
--- /dev/null
+++ b/examples/transform_output_iterator.cu
@@ -0,0 +1,44 @@
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <iostream>
+
+struct Functor 
+{
+  template<class Tuple>
+  __host__ __device__
+  float operator()(const Tuple& tuple) const
+  {
+    const float x = thrust::get<0>(tuple);
+    const float y = thrust::get<1>(tuple);
+    return x*y*2.0f / 3.0f;
+  }
+};
+
+int main(void)
+{
+  float u[4] = { 4 , 3,  2,   1};
+  float v[4] = {-1,  1,  1,  -1};
+  int idx[3] = {3, 0, 1};
+  float w[3] = {0, 0, 0};
+
+  thrust::device_vector<float> U(u, u + 4);
+  thrust::device_vector<float> V(v, v + 4);
+  thrust::device_vector<int> IDX(idx, idx + 3);
+  thrust::device_vector<float> W(w, w + 3);
+
+  // gather multiple elements and apply a function before writing result in memory
+  thrust::gather(
+      IDX.begin(), IDX.end(),
+      thrust::make_zip_iterator(thrust::make_tuple(U.begin(), V.begin())),
+      thrust::make_transform_output_iterator(W.begin(), Functor()));
+
+  std::cout << "result= [ ";
+  for (size_t i = 0; i < 3; i++)
+    std::cout << W[i] <<  " ";
+  std::cout << "] \n";
+
+  return 0;
+}
+
diff --git a/examples/uninitialized_vector.cu b/examples/uninitialized_vector.cu
index d120afdc8..90e8141fa 100644
--- a/examples/uninitialized_vector.cu
+++ b/examples/uninitialized_vector.cu
@@ -4,7 +4,7 @@
 // This example demonstrates how to avoid default construction of a
 // device_vector's data by using a custom allocator.
 
-#include <thrust/device_malloc_allocator.h>
+#include <thrust/device_allocator.h>
 #include <thrust/device_vector.h>
 #include <thrust/logical.h>
 #include <thrust/functional.h>
@@ -15,12 +15,38 @@
 // no-op construct member function
 template<typename T>
   struct uninitialized_allocator
-    : thrust::device_malloc_allocator<T>
+    : thrust::device_allocator<T>
 {
+  // the default generated constructors and destructors are implicitly
+  // marked __host__ __device__, but the current Thrust device_allocator
+  // can only be constructed and destroyed on the host; therefore, we
+  // define these as host only
+  __host__
+  uninitialized_allocator() {}
+  __host__
+  uninitialized_allocator(const uninitialized_allocator & other)
+    : thrust::device_allocator<T>(other) {}
+  __host__
+  ~uninitialized_allocator() {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  uninitialized_allocator & operator=(const uninitialized_allocator &) = default;
+#endif
+
+  // for correctness, you should also redefine rebind when you inherit
+  // from an allocator type; this way, if the allocator is rebound somewhere,
+  // it's going to be rebound to the correct type - and not to its base
+  // type for U
+  template<typename U>
+  struct rebind
+  {
+    typedef uninitialized_allocator<U> other;
+  };
+
   // note that construct is annotated as
   // a __host__ __device__ function
   __host__ __device__
-  void construct(T *p)
+  void construct(T *)
   {
     // no-op
   }
diff --git a/examples/version.cu b/examples/version.cu
index d342ac864..fd0685b2d 100644
--- a/examples/version.cu
+++ b/examples/version.cu
@@ -6,8 +6,9 @@ int main(void)
     int major = THRUST_MAJOR_VERSION;
     int minor = THRUST_MINOR_VERSION;
     int subminor = THRUST_SUBMINOR_VERSION;
+    int patch = THRUST_PATCH_NUMBER;
 
-    std::cout << "Thrust v" << major << "." << minor << "." << subminor << std::endl;
+    std::cout << "Thrust v" << major << "." << minor << "." << subminor << "-" << patch << std::endl;
 
     return 0;
 }
diff --git a/generate_mk.py b/generate_mk.py
new file mode 100755
index 000000000..84071338c
--- /dev/null
+++ b/generate_mk.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python
+# Generate set of projects mk files. 
+# Usage: python generate_mk.py PROJECTS_MK_DIR  THRUST_SOURCE_DIR
+#   The program scans through unit tests and examples in THRUST_SOURCE_DIR
+#   and generates project mk for each of the tests and examples in PROJECTS_MK_DIR
+#   A single example or unit test source file generates its own executable
+#   This program is called by a top level Makefile, but can also be used stand-alone for debugging
+#   This program also generates testing.mk, examples.mk and dependencies.mk
+from __future__ import print_function
+import sys
+import shutil as sh
+import os
+import glob
+import re
+
+test_template = """
+TEST_SRC   := %(TEST_SRC)s
+TEST_NAME  := %(TEST_NAME)s
+include $(ROOTDIR)/thrust/internal/build/generic_test.mk
+"""
+example_template = """
+EXAMPLE_SRC   := %(EXAMPLE_SRC)s
+EXAMPLE_NAME  := %(EXAMPLE_NAME)s
+include $(ROOTDIR)/thrust/internal/build/generic_example.mk
+"""
+
+def Glob(pattern, directory,exclude='\B'):
+    src = glob.glob(os.path.join(directory,pattern))
+    p = re.compile(exclude)
+    src = [s for s in src if not p.match(s)]
+    return src
+
+
+def generate_test_mk(mk_path, test_path, group, TEST_DIR):
+    print('Generating makefiles in "'+mk_path+'" for tests in "'+test_path+'"')
+    src_cu  = Glob("*.cu",  test_path, ".*testframework.cu$")
+    src_cxx = Glob("*.cpp", test_path)
+    src_cu.sort();
+    src_cxx.sort();
+    src_all = src_cu + src_cxx;
+    tests_all = []
+    dependencies_all = []
+    for s in src_all:
+        fn = os.path.splitext(os.path.basename(s));
+        t = "thrust."+group+"."+fn[0]
+        e = fn[1]
+        mkfile = test_template % {"TEST_SRC" : s,  "TEST_NAME" : t}
+        f = open(os.path.join(mk_path,t+".mk"), 'w')
+        f.write(mkfile)
+        f.close()
+        tests_all.append(os.path.join(mk_path,t))
+        dependencies_all.append(t+": testframework")
+    return [tests_all, dependencies_all]
+
+def generate_example_mk(mk_path, example_path, group, EXAMPLE_DIR):
+    print('Generating makefiles in "'+mk_path+'" for examples in "'+example_path+'"')
+    src_cu  = Glob("*.cu",  example_path)
+    src_cxx = Glob("*.cpp", example_path)
+    src_cu.sort();
+    src_cxx.sort();
+    src_all = src_cu + src_cxx;
+    examples_all = []
+    for s in src_all:
+        fn = os.path.splitext(os.path.basename(s));
+        t = "thrust."+group+"."+fn[0]
+        e = fn[1]
+        mkfile = example_template % {"EXAMPLE_SRC" : s, "EXAMPLE_NAME" : t}
+        f = open(os.path.join(mk_path,t+".mk"), 'w')
+        f.write(mkfile)
+        f.close()
+        examples_all.append(os.path.join(mk_path,t))
+    return examples_all
+
+
+## relpath : backported from os.relpath form python 2.6+
+def relpath(path, start):
+    """Return a relative version of a path"""
+
+    import posixpath
+    if not path:
+        raise ValueError("no path specified")
+    start_list = posixpath.abspath(start).split(posixpath.sep)
+    path_list = posixpath.abspath(path).split(posixpath.sep)
+    # Work out how much of the filepath is shared by start and path.
+    i = len(posixpath.commonprefix([start_list, path_list]))
+    rel_list = [posixpath.pardir] * (len(start_list)-i) + path_list[i:]
+    if not rel_list:
+        return posixpath.curdir
+    return posixpath.join(*rel_list)
+
+mk_path=sys.argv[1]
+REL_DIR="../../"
+if (len(sys.argv) > 2):
+    root_path=sys.argv[2];
+    mk_path = relpath(mk_path, root_path)
+    REL_DIR = relpath(root_path,mk_path)
+
+try:
+    sh.rmtree(mk_path)
+except:
+    pass
+os.makedirs(mk_path)
+
+tests_all, dependencies_all = generate_test_mk(mk_path, "testing/", "test", REL_DIR)
+tests_cu,  dependencies_cu  = generate_test_mk(mk_path, "testing/cuda/", "test.cuda", REL_DIR)
+tests_all.extend(tests_cu)
+dependencies_all.extend(dependencies_cu)
+
+testing_mk  = ""
+
+for t in tests_all:
+    testing_mk += "PROJECTS += "+t+"\n"
+testing_mk += "PROJECTS += internal/build/testframework\n"
+
+
+f = open(os.path.join(mk_path,"testing.mk"),'w')
+f.write(testing_mk)
+f.close()
+
+dependencies_mk = ""
+for d in dependencies_all:
+    dependencies_mk += d + "\n"
+
+f = open(os.path.join(mk_path,"dependencies.mk"),'w')
+f.write(dependencies_mk)
+f.close()
+
+
+examples_mk = ""
+examples_all  = generate_example_mk(mk_path, "examples/", "example", REL_DIR)
+examples_cuda = generate_example_mk(mk_path, "examples/cuda/", "example.cuda", REL_DIR)
+examples_all.extend(examples_cuda)
+for e in examples_all:
+    examples_mk += "PROJECTS += "+e+"\n"
+
+f = open(os.path.join(mk_path,"examples.mk"),'w')
+f.write(examples_mk)
+f.close()
+
+
+
+
+
+
+
+
diff --git a/internal/benchmark/CMakeLists.txt b/internal/benchmark/CMakeLists.txt
new file mode 100644
index 000000000..8c59747b8
--- /dev/null
+++ b/internal/benchmark/CMakeLists.txt
@@ -0,0 +1,30 @@
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  # MSVC builds fail at runtime. Benchmarks are linux-only for now.
+  message(STATUS "Thrust benchmarking is not available on MSVC.")
+  return()
+endif()
+
+add_custom_target(thrust.all.bench)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Skip non cpp.cuda targets:
+  if (NOT config_host   STREQUAL "CPP" OR
+      NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  set(bench_target ${config_prefix}.bench)
+
+  add_executable(${bench_target} bench.cu)
+  target_link_libraries(${bench_target} PRIVATE ${thrust_target})
+  target_include_directories(${bench_target} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+  thrust_clone_target_properties(${bench_target} ${thrust_target})
+  thrust_fix_clang_nvcc_build_for(${bench_target})
+
+  add_dependencies(thrust.all.bench ${bench_target})
+  add_dependencies(${config_prefix}.all ${bench_target})
+endforeach()
diff --git a/internal/benchmark/README.txt b/internal/benchmark/README.txt
new file mode 100644
index 000000000..73b0cc058
--- /dev/null
+++ b/internal/benchmark/README.txt
@@ -0,0 +1,31 @@
+Directions for compiling and running the benchmark with Ubuntu Linux:
+
+Install Intel's Threading Building Blocks library (TBB):
+$ sudo apt-get install libtbb-dev
+
+Compile the benchmark:
+$ nvcc -O3 -arch=sm_20 bench.cu -ltbb -o bench
+
+Run the benchmark:
+$ ./bench
+
+Typical output (Tesla C2050):
+
+Benchmarking with input size 33554432
+Core Primitive Performance (elements per second)
+      Algorithm,          STL,          TBB,       Thrust
+         reduce,   3121746688,   3739585536,  26134038528
+      transform,   1869492736,   2347719424,  13804681216
+           scan,   1394143744,   1439394816,   5039195648
+           sort,     11070660,     34622352,    673543168
+Sorting Performance (keys per second)
+  Type,          STL,          TBB,       Thrust
+  char,     24050078,     62987040,   2798874368
+ short,     15644141,     41275164,   1428603008
+   int,     11062616,     33478628,    682295744
+  long,     11249874,     33972564,    219719184
+ float,      9850043,     29011806,    692407232
+double,      9700181,     27153626,    224345568
+
+The reported numbers are performance rates in "elements per second" (higher is better).
+
diff --git a/internal/benchmark/bench.cu b/internal/benchmark/bench.cu
new file mode 100644
index 000000000..38d1d647a
--- /dev/null
+++ b/internal/benchmark/bench.cu
@@ -0,0 +1,1274 @@
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/pair.h>
+#include <thrust/sort.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/random.h>
+#include <thrust/shuffle.h>
+
+#include <random>
+#endif
+
+#include <algorithm>
+#include <numeric>
+
+#include <map>
+#include <string>
+#include <exception>
+
+#include <iostream>
+
+#include <cassert>
+#include <cstdlib>    // For `atoi`.
+#include <climits>    // For CHAR_BIT.
+#include <cmath>      // For `sqrt` and `abs`.
+
+#include <stdint.h>   // For `intN_t`.
+
+#include "random.h"
+#include "timer.h"
+
+#if defined(HAVE_TBB)
+  #include "tbb_algos.h"
+#endif
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+  #include <thrust/system_error.h>      // For `thrust::system_error`
+  #include <thrust/system/cuda/error.h> // For `thrust::cuda_category`
+#endif
+
+// We don't use THRUST_PP_STRINGIZE and THRUST_PP_CAT because they are new, and
+// we want this benchmark to be backwards-compatible to older versions of Thrust.
+#define PP_STRINGIZE_(expr) #expr
+#define PP_STRINGIZE(expr)  PP_STRINGIZE_(expr)
+
+#define PP_CAT(a, b) a ## b
+
+// We don't use THRUST_NOEXCEPT because it's new, and we want this benchmark to
+// be backwards-compatible to older versions of Thrust.
+#if THRUST_CPP_DIALECT >= 2011
+  #define NOEXCEPT noexcept
+#else
+  #define NOEXCEPT throw()
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct squared_difference
+{
+private:
+  T const average;
+
+public:
+  __host__ __device__
+  squared_difference(squared_difference const& rhs) : average(rhs.average) {}
+
+  __host__ __device__
+  squared_difference(T average_) : average(average_) {}
+
+  __host__ __device__
+  T operator()(T x) const
+  {
+    return (x - average) * (x - average);
+  }
+};
+
+template <typename T>
+struct value_and_count
+{
+  T           value;
+  uint64_t count;
+
+  __host__ __device__
+  value_and_count(value_and_count const& other)
+    : value(other.value), count(other.count) {}
+
+  __host__ __device__
+  value_and_count(T const& value_)
+    : value(value_), count(1) {}
+
+  __host__ __device__
+  value_and_count(T const& value_, uint64_t count_)
+    : value(value_), count(count_) {}
+
+  __host__ __device__
+  value_and_count& operator=(value_and_count const& other)
+  {
+    value = other.value;
+    count = other.count;
+    return *this;
+  }
+
+  __host__ __device__
+  value_and_count& operator=(T const& value_)
+  {
+    value = value_;
+    count = 1;
+    return *this;
+  }
+};
+
+template <typename T, typename ReduceOp>
+struct counting_op
+{
+private:
+  ReduceOp reduce;
+
+public:
+  __host__ __device__
+  counting_op() : reduce() {}
+
+  __host__ __device__
+  counting_op(counting_op const& other) : reduce(other.reduce) {}
+
+  __host__ __device__
+  counting_op(ReduceOp const& reduce_) : reduce(reduce_) {}
+
+  __host__ __device__
+  value_and_count<T> operator()(
+      value_and_count<T> const& x
+    , T const&                  y
+    ) const
+  {
+    return value_and_count<T>(reduce(x.value, y), x.count + 1);
+  }
+
+  __host__ __device__
+  value_and_count<T> operator()(
+      value_and_count<T> const& x
+    , value_and_count<T> const& y
+    ) const
+  {
+    return value_and_count<T>(reduce(x.value, y.value), x.count + y.count);
+  }
+};
+
+template <typename InputIt, typename T>
+T arithmetic_mean(InputIt first, InputIt last, T init)
+{
+  value_and_count<T> init_vc(init, 0);
+
+  counting_op<T, thrust::plus<T> > reduce_vc;
+
+  value_and_count<T> vc
+    = thrust::reduce(first, last, init_vc, reduce_vc);
+
+  return vc.value / vc.count;
+}
+
+template <typename InputIt>
+typename thrust::iterator_traits<InputIt>::value_type
+arithmetic_mean(InputIt first, InputIt last)
+{
+  typedef typename thrust::iterator_traits<InputIt>::value_type T;
+  return arithmetic_mean(first, last, T());
+}
+
+template <typename InputIt, typename T>
+T sample_standard_deviation(InputIt first, InputIt last, T average)
+{
+  value_and_count<T> init_vc(T(), 0);
+
+  counting_op<T, thrust::plus<T> > reduce_vc;
+
+  squared_difference<T> transform(average);
+
+  value_and_count<T> vc
+    = thrust::transform_reduce(first, last, transform, init_vc, reduce_vc);
+
+  return std::sqrt(vc.value / T(vc.count - 1));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Formulas for propagation of uncertainty from:
+//
+//   https://en.wikipedia.org/wiki/Propagation_of_uncertainty#Example_formulas
+//
+// Even though it's Wikipedia, I trust it as I helped write that table.
+//
+// XXX Replace with a proper reference.
+
+// Compute the propagated uncertainty from the multiplication of two uncertain
+// values, `A +/- A_unc` and `B +/- B_unc`. Given `f = AB` or `f = A/B`, where
+// `A != 0` and `B != 0`, the uncertainty in `f` is approximately:
+//
+//   f_unc = abs(f) * sqrt((A_unc / A) ^ 2 + (B_unc / B) ^ 2)
+//
+template <typename T>
+__host__ __device__
+T uncertainty_multiplicative(
+    T const& f
+  , T const& A, T const& A_unc
+  , T const& B, T const& B_unc
+    )
+{
+  return std::abs(f)
+       * std::sqrt((A_unc / A) * (A_unc / A) + (B_unc / B) * (B_unc / B));
+}
+
+// Compute the propagated uncertainty from addition of two uncertain values,
+// `A +/- A_unc` and `B +/- B_unc`. Given `f = cA + dB` (where `c` and `d` are
+// certain constants), the uncertainty in `f` is approximately:
+//
+//   f_unc = sqrt(c ^ 2 * A_unc ^ 2 + d ^ 2 * B_unc ^ 2)
+//
+template <typename T>
+__host__ __device__
+T uncertainty_additive(
+    T const& c, T const& A_unc
+  , T const& d, T const& B_unc
+    )
+{
+  return std::sqrt((c * c * A_unc * A_unc) + (d * d * B_unc * B_unc));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Return the significant digit of `x`. The result is the number of digits
+// after the decimal place to round to (negative numbers indicate rounding
+// before the decimal place)
+template <typename T>
+int find_significant_digit(T x)
+{
+  if (x == T(0)) return T(0);
+  return -int(std::floor(std::log10(std::abs(x))));
+}
+
+// Round `x` to `ndigits` after the decimal place (Python-style).
+template <typename T, typename N>
+T round_to_precision(T x, N ndigits)
+{
+  double m = (x < 0.0) ? -1.0 : 1.0;
+  double pwr = std::pow(T(10.0), ndigits);
+  return (std::floor(x * m * pwr + 0.5) / pwr) * m;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+void print_experiment_header()
+{ // {{{
+  std::cout << "Thrust Version"
+    << ","  << "Algorithm"
+    << ","  << "Element Type"
+    << ","  << "Element Size"
+    << ","  << "Elements per Trial"
+    << ","  << "Total Input Size"
+    << ","  << "STL Trials"
+    << ","  << "STL Average Walltime"
+    << ","  << "STL Walltime Uncertainty"
+    << ","  << "STL Average Throughput"
+    << ","  << "STL Throughput Uncertainty"
+    << ","  << "Thrust Trials"
+    << ","  << "Thrust Average Walltime"
+    << ","  << "Thrust Walltime Uncertainty"
+    << ","  << "Thrust Average Throughput"
+    << ","  << "Thrust Throughput Uncertainty"
+    #if defined(HAVE_TBB)
+    << ","  << "TBB Trials"
+    << ","  << "TBB Average Walltime"
+    << ","  << "TBB Walltime Uncertainty"
+    << ","  << "TBB Average Throughput"
+    << ","  << "TBB Throughput Uncertainty"
+    #endif
+    << std::endl;
+
+  std::cout << ""                // Thrust Version.
+    << ","  << ""                // Algorithm.
+    << ","  << ""                // Element Type.
+    << ","  << "bits/element"    // Element Size.
+    << ","  << "elements"        // Elements per Trial.
+    << ","  << "MiBs"            // Total Input Size.
+    << ","  << "trials"          // STL Trials.
+    << ","  << "secs"            // STL Average Walltime.
+    << ","  << "secs"            // STL Walltime Uncertainty.
+    << ","  << "elements/sec"    // STL Average Throughput.
+    << ","  << "elements/sec"    // STL Throughput Uncertainty.
+    << ","  << "trials"          // Thrust Trials.
+    << ","  << "secs"            // Thrust Average Walltime.
+    << ","  << "secs"            // Thrust Walltime Uncertainty.
+    << ","  << "elements/sec"    // Thrust Average Throughput.
+    << ","  << "elements/sec"    // Thrust Throughput Uncertainty.
+    #if defined(HAVE_TBB)
+    << ","  << "trials"          // TBB Trials.
+    << ","  << "secs"            // TBB Average Walltime.
+    << ","  << "secs"            // TBB Walltime Uncertainty.
+    << ","  << "elements/sec"    // TBB Average Throughput.
+    << ","  << "elements/sec"    // TBB Throughput Uncertainty.
+    #endif
+    << std::endl;
+} // }}}
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct experiment_results
+{
+  double const average_time; // Arithmetic mean of trial times in seconds.
+  double const stdev_time;   // Sample standard deviation of trial times.
+
+  experiment_results(double average_time_, double stdev_time_)
+    : average_time(average_time_), stdev_time(stdev_time_) {}
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType // Has an embedded typedef `type,
+                                              // and a static method `name` that
+                                              // returns a char const*.
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+struct experiment_driver
+{
+  typedef typename ElementMetaType::type element_type;
+
+  static char const* const test_name;
+  static char const* const element_type_name; // Element type name as a string.
+
+  static uint64_t const elements;             // # of elements per trial.
+  static uint64_t const element_size;         // Size of each element in bits.
+  static double   const input_size;           // `elements` * `element_size` in MiB.
+  static uint64_t const baseline_trials;      // # of baseline trials per experiment.
+  static uint64_t const regular_trials;       // # of regular trials per experiment.
+
+  static void run_experiment()
+  { // {{{
+    experiment_results stl    = std_experiment();
+    experiment_results thrust = thrust_experiment();
+    #if defined(HAVE_TBB)
+    experiment_results tbb    = tbb_experiment();
+    #endif
+
+    double stl_average_walltime    = stl.average_time;
+    double thrust_average_walltime = thrust.average_time;
+    #if defined(HAVE_TBB)
+    double tbb_average_walltime    = tbb.average_time;
+    #endif
+
+    double stl_average_throughput    = elements / stl.average_time;
+    double thrust_average_throughput = elements / thrust.average_time;
+    #if defined(HAVE_TBB)
+    double tbb_average_throughput    = elements / tbb.average_time;
+    #endif
+
+    double stl_walltime_uncertainty    = stl.stdev_time;
+    double thrust_walltime_uncertainty = thrust.stdev_time;
+    #if defined(HAVE_TBB)
+    double tbb_walltime_uncertainty    = tbb.stdev_time;
+    #endif
+
+    double stl_throughput_uncertainty    = uncertainty_multiplicative(
+        stl_average_throughput
+      , double(elements), 0.0
+      , stl_average_walltime, stl_walltime_uncertainty
+    );
+    double thrust_throughput_uncertainty = uncertainty_multiplicative(
+        thrust_average_throughput
+      , double(elements), 0.0
+      , thrust_average_walltime, thrust_walltime_uncertainty
+    );
+
+    #if defined(HAVE_TBB)
+    double tbb_throughput_uncertainty    = uncertainty_multiplicative(
+        tbb_average_throughput
+      , double(elements), 0.0
+      , tbb_average_walltime, tbb_walltime_uncertainty
+    );
+    #endif
+
+    // Round the average walltime and walltime uncertainty to the
+    // significant figure of the walltime uncertainty.
+    int stl_walltime_precision = std::max(
+        find_significant_digit(stl.average_time)
+      , find_significant_digit(stl.stdev_time)
+    );
+    int thrust_walltime_precision = std::max(
+        find_significant_digit(thrust.average_time)
+      , find_significant_digit(thrust.stdev_time)
+    );
+    #if defined(HAVE_TBB)
+    int tbb_walltime_precision = std::max(
+        find_significant_digit(tbb.average_time)
+      , find_significant_digit(tbb.stdev_time)
+    );
+    #endif
+
+    stl_average_walltime = round_to_precision(
+        stl_average_walltime, stl_walltime_precision
+    );
+    thrust_average_walltime = round_to_precision(
+        thrust_average_walltime, thrust_walltime_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_average_walltime = round_to_precision(
+        tbb_average_walltime, tbb_walltime_precision
+    );
+    #endif
+
+    stl_walltime_uncertainty = round_to_precision(
+        stl_walltime_uncertainty, stl_walltime_precision
+    );
+    thrust_walltime_uncertainty = round_to_precision(
+        thrust_walltime_uncertainty, thrust_walltime_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_walltime_uncertainty = round_to_precision(
+        tbb_walltime_uncertainty, tbb_walltime_precision
+    );
+    #endif
+
+    // Round the average throughput and throughput uncertainty to the
+    // significant figure of the throughput uncertainty.
+    int stl_throughput_precision = std::max(
+        find_significant_digit(stl_average_throughput)
+      , find_significant_digit(stl_throughput_uncertainty)
+    );
+    int thrust_throughput_precision = std::max(
+        find_significant_digit(thrust_average_throughput)
+      , find_significant_digit(thrust_throughput_uncertainty)
+    );
+    #if defined(HAVE_TBB)
+    int tbb_throughput_precision = std::max(
+        find_significant_digit(tbb_average_throughput)
+      , find_significant_digit(tbb_throughput_uncertainty)
+    );
+    #endif
+
+    stl_average_throughput = round_to_precision(
+        stl_average_throughput, stl_throughput_precision
+    );
+    thrust_average_throughput = round_to_precision(
+        thrust_average_throughput, thrust_throughput_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_average_throughput = round_to_precision(
+        tbb_average_throughput, tbb_throughput_precision
+    );
+    #endif
+
+    stl_throughput_uncertainty = round_to_precision(
+        stl_throughput_uncertainty, stl_throughput_precision
+    );
+    thrust_throughput_uncertainty = round_to_precision(
+        thrust_throughput_uncertainty, thrust_throughput_precision
+    );
+    #if defined(HAVE_TBB)
+    tbb_throughput_uncertainty = round_to_precision(
+        tbb_throughput_uncertainty, tbb_throughput_precision
+    );
+    #endif
+
+    std::cout << THRUST_VERSION                // Thrust Version.
+      << ","  << test_name                     // Algorithm.
+      << ","  << element_type_name             // Element Type.
+      << ","  << element_size                  // Element Size.
+      << ","  << elements                      // Elements per Trial.
+      << ","  << input_size                    // Total Input Size.
+      << ","  << baseline_trials               // STL Trials.
+      << ","  << stl_average_walltime          // STL Average Walltime.
+      << ","  << stl_walltime_uncertainty      // STL Walltime Uncertainty.
+      << ","  << stl_average_throughput        // STL Average Throughput.
+      << ","  << stl_throughput_uncertainty    // STL Throughput Uncertainty.
+      << ","  << regular_trials                // Thrust Trials.
+      << ","  << thrust_average_walltime       // Thrust Average Walltime.
+      << ","  << thrust_walltime_uncertainty   // Thrust Walltime Uncertainty.
+      << ","  << thrust_average_throughput     // Thrust Average Throughput.
+      << ","  << thrust_throughput_uncertainty // Thrust Throughput Uncertainty.
+      #if defined(HAVE_TBB)
+      << ","  << regular_trials                // TBB Trials.
+      << ","  << tbb_average_walltime          // TBB Average Walltime.
+      << ","  << tbb_walltime_uncertainty      // TBB Walltime Uncertainty.
+      << ","  << tbb_average_throughput        // TBB Average Throughput.
+      << ","  << tbb_throughput_uncertainty    // TBB Throughput Uncertainty.
+      #endif
+      << std::endl;
+  } // }}}
+
+private:
+  static experiment_results std_experiment()
+  {
+    return experiment<typename Test<element_type>::std_trial>();
+  }
+
+  static experiment_results thrust_experiment()
+  {
+    return experiment<typename Test<element_type>::thrust_trial>();
+  }
+
+  #if defined(HAVE_TBB)
+  static experiment_results tbb_experiment()
+  {
+    return experiment<typename Test<element_type>::tbb_trial>();
+  }
+  #endif
+
+  template <typename Trial>
+  static experiment_results experiment()
+  { // {{{
+    Trial trial;
+
+    // Allocate storage and generate random input for the warmup trial.
+    trial.setup(elements);
+
+    // Warmup trial.
+    trial();
+
+    uint64_t const trials
+      = trial.is_baseline() ? baseline_trials : regular_trials;
+
+    std::vector<double> times;
+    times.reserve(trials);
+
+    for (uint64_t t = 0; t < trials; ++t)
+    {
+      // Generate random input for next trial.
+      trial.setup(elements);
+
+      steady_timer e;
+
+      // Benchmark.
+      e.start();
+      trial();
+      e.stop();
+
+      times.push_back(e.seconds_elapsed());
+    }
+
+    double average_time
+      = arithmetic_mean(times.begin(), times.end());
+
+    double stdev_time
+      = sample_standard_deviation(times.begin(), times.end(), average_time);
+
+    return experiment_results(average_time, stdev_time);
+  } // }}}
+};
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+char const* const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::test_name
+  = Test<typename ElementMetaType::type>::test_name();
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+char const* const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::element_type_name
+  = ElementMetaType::name();
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::element_size
+  = CHAR_BIT * sizeof(typename ElementMetaType::type);
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::elements
+  = Elements;
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+double const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::input_size
+  = double( Elements /* [elements] */
+          * sizeof(typename ElementMetaType::type) /* [bytes/element] */
+          )
+  / double(1024 * 1024 /* [bytes/MiB] */);
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::baseline_trials
+  = BaselineTrials;
+
+template <
+    template <typename> class Test
+  , typename                  ElementMetaType
+  , uint64_t                  Elements
+  , uint64_t                  BaselineTrials
+  , uint64_t                  RegularTrials
+>
+uint64_t const
+experiment_driver<
+  Test, ElementMetaType, Elements, BaselineTrials, RegularTrials
+>::regular_trials
+  = RegularTrials;
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Never create variables, pointers or references of any of the `*_trial_base`
+// classes. They are purely mixin base classes and do not have vtables and
+// virtual destructors. Using them for polymorphism instead of composition will
+// probably cause slicing.
+
+struct baseline_trial {};
+struct regular_trial {};
+
+template <typename TrialKind = regular_trial>
+struct trial_base;
+
+template <>
+struct trial_base<baseline_trial>
+{
+  static bool is_baseline() { return true; }
+};
+
+template <>
+struct trial_base<regular_trial>
+{
+  static bool is_baseline() { return false; }
+};
+
+template <typename Container, typename TrialKind = regular_trial>
+struct inplace_trial_base : trial_base<TrialKind>
+{
+  Container input;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+
+    randomize(input);
+  }
+};
+
+template <typename Container, typename TrialKind = regular_trial>
+struct copy_trial_base : trial_base<TrialKind>
+{
+  Container input;
+  Container output;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+    output.resize(elements);
+
+    randomize(input);
+  }
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename Container, typename TrialKind = regular_trial>
+struct shuffle_trial_base : trial_base<TrialKind>
+{
+  Container input;
+
+  void setup(uint64_t elements)
+  {
+    input.resize(elements);
+
+    randomize(input);
+  }
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct reduce_tester
+{
+  static char const* test_name() { return "reduce"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      if (std::accumulate(this->input.begin(), this->input.end(), T(0)) == 0)
+        // Prevent optimizer from removing body.
+        std::cout << "xyz";
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::reduce(this->input.begin(), this->input.end());
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_reduce(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct sort_tester
+{
+  static char const* test_name() { return "sort"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::sort(this->input.begin(), this->input.end());
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::sort(this->input.begin(), this->input.end());
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_sort(this->input);
+    }
+  }
+  #endif
+};
+
+
+template <typename T>
+struct transform_inplace_tester
+{
+  static char const* test_name() { return "transform_inplace"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::transform(
+          this->input.begin(), this->input.end(), this->input.begin()
+        , thrust::negate<T>()
+      );
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::transform(
+          this->input.begin(), this->input.end(), this->input.begin()
+        , thrust::negate<T>()
+      );
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_transform(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct inclusive_scan_inplace_tester
+{
+  static char const* test_name() { return "inclusive_scan_inplace"; }
+
+  struct std_trial : inplace_trial_base<std::vector<T>, baseline_trial>
+  {
+    void operator()()
+    {
+      std::partial_sum(
+          this->input.begin(), this->input.end(), this->input.begin()
+      );
+    }
+  };
+
+  struct thrust_trial : inplace_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::inclusive_scan(
+          this->input.begin(), this->input.end(), this->input.begin()
+      );
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : inplace_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_scan(this->input);
+    }
+  };
+  #endif
+};
+
+template <typename T>
+struct copy_tester
+{
+  static char const* test_name() { return "copy"; }
+
+  struct std_trial : copy_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      std::copy(this->input.begin(), this->input.end(), this->output.begin());
+    }
+  };
+
+  struct thrust_trial : copy_trial_base<thrust::device_vector<T> >
+  {
+    void operator()()
+    {
+      thrust::copy(this->input.begin(), this->input.end(), this->input.begin());
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+
+  #if defined(HAVE_TBB)
+  struct tbb_trial : copy_trial_base<std::vector<T> >
+  {
+    void operator()()
+    {
+      tbb_copy(this->input, this->output);
+    }
+  };
+  #endif
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template <typename T>
+struct shuffle_tester
+{
+  static char const* test_name() { return "shuffle"; }
+
+  struct std_trial : shuffle_trial_base<std::vector<T>, baseline_trial>
+  {
+    std::default_random_engine g;
+    void operator()()
+    {
+      std::shuffle(this->input.begin(), this->input.end(), this->g);
+    }
+  };
+
+  struct thrust_trial : shuffle_trial_base<thrust::device_vector<T> >
+  {
+    thrust::default_random_engine g;
+    void operator()()
+    {
+      thrust::shuffle(this->input.begin(), this->input.end(), this->g);
+      #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cudaError_t err = cudaDeviceSynchronize();
+        if (err != cudaSuccess)
+          throw thrust::error_code(err, thrust::cuda_category());
+      #endif
+    }
+  };
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    typename ElementMetaType
+  , uint64_t Elements
+  , uint64_t BaselineTrials
+  , uint64_t RegularTrials
+>
+void run_core_primitives_experiments_for_type()
+{
+  experiment_driver<
+      reduce_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+    transform_inplace_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      inclusive_scan_inplace_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      sort_tester
+    , ElementMetaType
+//    , Elements / sizeof(typename ElementMetaType::type)
+    , (Elements >> 6) // Sorting is more sensitive to element count than
+                      // memory footprint.
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      copy_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+
+  experiment_driver<
+      shuffle_tester
+    , ElementMetaType
+    , Elements / sizeof(typename ElementMetaType::type)
+    , BaselineTrials
+    , RegularTrials
+  >::run_experiment();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define DEFINE_ELEMENT_META_TYPE(T)                       \
+  struct PP_CAT(T, _meta)                                 \
+  {                                                       \
+    typedef T type;                                       \
+                                                          \
+    static char const* name() { return PP_STRINGIZE(T); } \
+  };                                                      \
+  /**/
+
+DEFINE_ELEMENT_META_TYPE(char);
+DEFINE_ELEMENT_META_TYPE(int);
+DEFINE_ELEMENT_META_TYPE(int8_t);
+DEFINE_ELEMENT_META_TYPE(int16_t);
+DEFINE_ELEMENT_META_TYPE(int32_t);
+DEFINE_ELEMENT_META_TYPE(int64_t);
+DEFINE_ELEMENT_META_TYPE(float);
+DEFINE_ELEMENT_META_TYPE(double);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+    uint64_t Elements
+  , uint64_t BaselineTrials
+  , uint64_t RegularTrials
+>
+void run_core_primitives_experiments()
+{
+  run_core_primitives_experiments_for_type<
+    char_meta,    Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int_meta,     Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int8_t_meta,  Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int16_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int32_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    int64_t_meta, Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    float_meta,   Elements, BaselineTrials, RegularTrials
+  >();
+  run_core_primitives_experiments_for_type<
+    double_meta,  Elements, BaselineTrials, RegularTrials
+  >();
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// XXX Use `std::string_view` when possible.
+std::vector<std::string> split(std::string const& str, std::string const& delim)
+{
+  std::vector<std::string> tokens;
+  std::string::size_type prev = 0, pos = 0;
+  do
+  {
+    pos = str.find(delim, prev);
+    if (pos == std::string::npos) pos = str.length();
+    std::string token = str.substr(prev, pos - prev);
+    if (!token.empty()) tokens.push_back(token);
+    prev = pos + delim.length();
+  }
+  while (pos < str.length() && prev < str.length());
+  return tokens;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct command_line_option_error : std::exception
+{
+  virtual ~command_line_option_error() NOEXCEPT {}
+  virtual const char* what() const NOEXCEPT = 0;
+};
+
+struct only_one_option_allowed : command_line_option_error
+{
+  // Construct a new `only_one_option_allowed` exception. `key` is the
+  // option name and `[first, last)` is a sequence of
+  // `std::pair<std::string const, std::string>`s (the values).
+  template <typename InputIt>
+  only_one_option_allowed(std::string const& key, InputIt first, InputIt last)
+    : message()
+  {
+    message  = "Only one `--";
+    message += key;
+    message += "` option is allowed, but multiple were received: ";
+
+    for (; first != last; ++first)
+    {
+      message += "`";
+      message += (*first).second;
+      message += "` ";
+    }
+
+    // Remove the trailing space added by the last iteration of the above loop.
+    message.erase(message.size() - 1, 1);
+
+    message += ".";
+  }
+
+  virtual ~only_one_option_allowed() NOEXCEPT {}
+
+  virtual const char* what() const NOEXCEPT
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+struct required_option_missing : command_line_option_error
+{
+  // Construct a new `requirement_option_missing` exception. `key` is the
+  // option name.
+  required_option_missing(std::string const& key)
+    : message()
+  {
+    message  = "`--";
+    message += key;
+    message += "` option is required.";
+  }
+
+  virtual ~required_option_missing() NOEXCEPT {}
+
+  virtual const char* what() const NOEXCEPT
+  {
+    return message.c_str();
+  }
+
+private:
+  std::string message;
+};
+
+struct command_line_processor
+{
+  typedef std::vector<std::string> positional_options_type;
+
+  typedef std::multimap<std::string, std::string> keyword_options_type;
+
+  typedef std::pair<
+    keyword_options_type::const_iterator
+  , keyword_options_type::const_iterator
+  > keyword_option_values;
+
+  command_line_processor(int argc, char** argv)
+    : pos_args(), kw_args()
+  { // {{{
+    for (int i = 1; i < argc; ++i)
+    {
+      std::string arg(argv[i]);
+
+      // Look for --key or --key=value options.
+      if (arg.substr(0, 2) == "--")
+      {
+        std::string::size_type n = arg.find('=', 2);
+
+        keyword_options_type::value_type key_value;
+
+        if (n == std::string::npos) // --key
+          kw_args.insert(keyword_options_type::value_type(
+            arg.substr(2), ""
+          ));
+        else                        // --key=value
+          kw_args.insert(keyword_options_type::value_type(
+            arg.substr(2, n - 2), arg.substr(n + 1)
+          ));
+
+        kw_args.insert(key_value);
+      }
+      else // Assume it's positional.
+        pos_args.push_back(arg);
+    }
+  } // }}}
+
+  // Return the value for option `key`.
+  //
+  // Throws:
+  // * `only_one_option_allowed` if there is more than one value for `key`.
+  // * `required_option_missing` if there is no value for `key`.
+  std::string operator()(std::string const& key) const
+  {
+    keyword_option_values v = kw_args.equal_range(key);
+
+    keyword_options_type::difference_type d = std::distance(v.first, v.second);
+
+    if      (1 < d)  // Too many options.
+      throw only_one_option_allowed(key, v.first, v.second);
+    else if (0 == d) // No option.
+      throw required_option_missing(key);
+
+    return (*v.first).second;
+  }
+
+  // Return the value for option `key`, or `dflt` if `key` has no value.
+  //
+  // Throws: `only_one_option_allowed` if there is more than one value for `key`.
+  std::string operator()(std::string const& key, std::string const& dflt) const
+  {
+    keyword_option_values v = kw_args.equal_range(key);
+
+    keyword_options_type::difference_type d = std::distance(v.first, v.second);
+
+    if (1 < d)  // Too many options.
+      throw only_one_option_allowed(key, v.first, v.second);
+
+    if (0 == d) // No option.
+      return dflt;
+    else        // 1 option.
+      return (*v.first).second;
+  }
+
+  // Returns `true` if the option `key` was specified at least once.
+  bool has(std::string const& key) const
+  {
+    return kw_args.count(key) > 0;
+  }
+
+private:
+  positional_options_type pos_args;
+  keyword_options_type    kw_args;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+int main(int argc, char** argv)
+{
+  command_line_processor clp(argc, argv);
+
+  #if defined(HAVE_TBB)
+  tbb::task_scheduler_init init;
+
+  test_tbb();
+  #endif
+
+  #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    // Set the CUDA device to use for the benchmark - `0` by default.
+
+    int device = std::atoi(clp("device", "0").c_str());
+    // `std::atoi` returns 0 if the conversion fails.
+
+    cudaSetDevice(device);
+  #endif
+
+  if (!clp.has("no-header"))
+    print_experiment_header();
+
+                                          /* Elements |       Trials       */
+                                          /*          | Baseline | Regular */
+//run_core_primitives_experiments< 1LLU << 21LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 22LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 23LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 24LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 25LLU      , 4        , 16      >();
+  run_core_primitives_experiments< 1LLU << 26LLU      , 4        , 16      >();
+  run_core_primitives_experiments< 1LLU << 27LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 28LLU      , 4        , 16      >();
+//run_core_primitives_experiments< 1LLU << 29LLU      , 4        , 16      >();
+
+  return 0;
+}
+
+// TODO: Add different input sizes and half precision
diff --git a/internal/benchmark/bench.mk b/internal/benchmark/bench.mk
new file mode 100644
index 000000000..2a5c002bc
--- /dev/null
+++ b/internal/benchmark/bench.mk
@@ -0,0 +1,20 @@
+# XXX Use the common Thrust Makefiles instead of this.
+
+EXECUTABLE := bench
+BUILD_SRC  := $(ROOTDIR)/thrust/internal/benchmark/bench.cu
+
+ifeq ($(OS),Linux)
+  LIBRARIES += m
+endif
+
+# XXX Why is this needed?
+ifeq ($(OS),Linux)
+  ifeq ($(ABITYPE), androideabi)
+    override ALL_SASS_ARCHITECTURES := 32
+  endif
+endif
+
+ARCH_NEG_FILTER += 20 21
+
+include $(ROOTDIR)/thrust/internal/build/common_detect.mk
+include $(ROOTDIR)/thrust/internal/build/common_build.mk
diff --git a/internal/benchmark/combine_benchmark_results.py b/internal/benchmark/combine_benchmark_results.py
new file mode 100755
index 000000000..f82b21f80
--- /dev/null
+++ b/internal/benchmark/combine_benchmark_results.py
@@ -0,0 +1,817 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2012-7 Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+###############################################################################
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+# XXX Put code shared with `compare_benchmark_results.py` in a common place.
+
+# XXX Relative uncertainty.
+
+from sys import exit, stdout
+
+from os.path import splitext
+
+from itertools import imap # Lazy map.
+
+from math import sqrt, log10, floor
+
+from collections import deque
+
+from argparse import ArgumentParser as argument_parser
+
+from csv import DictReader as csv_dict_reader
+from csv import DictWriter as csv_dict_writer
+
+from re import compile as regex_compile
+
+###############################################################################
+
+def unpack_tuple(f):
+  """Return a unary function that calls `f` with its argument unpacked."""
+  return lambda args: f(*iter(args))
+
+def strip_dict(d):
+  """Strip leading and trailing whitespace from all keys and values in `d`."""
+  d.update({key: value.strip() for (key, value) in d.items()})
+
+def merge_dicts(d0, d1):
+  """Create a new `dict` that is the union of `dict`s `d0` and `d1`."""
+  d = d0.copy()
+  d.update(d1)
+  return d
+
+def strip_list(l):
+  """Strip leading and trailing whitespace from all values in `l`."""
+  for i, value in enumerate(l): l[i] = value.strip()
+
+###############################################################################
+
+def int_or_float(x):
+  """Convert `x` to either `int` or `float`, preferring `int`.
+
+  Raises:
+    ValueError : If `x` is not convertible to either `int` or `float`
+  """
+  try:
+    return int(x)
+  except ValueError:
+    return float(x)
+
+def try_int_or_float(x):
+  """Try to convert `x` to either `int` or `float`, preferring `int`. `x` is
+  returned unmodified if conversion fails.
+  """
+  try:
+    return int_or_float(x)
+  except ValueError:
+    return x
+
+###############################################################################
+
+def find_significant_digit(x):
+  """Return the significant digit of the number x. The result is the number of
+  digits after the decimal place to round to (negative numbers indicate rounding
+  before the decimal place)."""
+  if x == 0: return 0
+  return -int(floor(log10(abs(x))))
+
+def round_with_int_conversion(x, ndigits = None):
+  """Rounds `x` to `ndigits` after the the decimal place. If `ndigits` is less
+  than 1, convert the result to `int`. If `ndigits` is `None`, the significant
+  digit of `x` is used."""
+  if ndigits is None: ndigits = find_significant_digit(x)
+  x_rounded = round(x, ndigits)
+  return int(x_rounded) if ndigits < 1 else x_rounded
+
+###############################################################################
+
+class measured_variable(object):
+  """A meta-variable representing measured data. It is composed of three raw
+  variables plus units meta-data.
+
+  Attributes:
+    quantity (`str`) :
+      Name of the quantity variable of this object.
+    uncertainty (`str`) :
+      Name of the uncertainty variable of this object.
+    sample_size (`str`) :
+      Name of the sample size variable of this object.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+class measured_value(object):
+  """An object that represents a value determined by multiple measurements.
+
+  Attributes:
+    quantity (scalar) :
+      The quantity of the value, e.g. the arithmetic mean.
+    uncertainty (scalar) :
+      The measurement uncertainty, e.g. the sample standard deviation.
+    sample_size (`int`) :
+      The number of observations contributing to the value.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size = 1, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+###############################################################################
+
+def arithmetic_mean(X):
+  """Computes the arithmetic mean of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+
+  .. math::
+
+    u = \frac{\sum_{i = 0}^{n - 1} X_i}{n}
+  """
+  return sum(X) / len(X)
+
+def sample_variance(X, u = None):
+  """Computes the sample variance of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    v = \frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  return sum(imap(lambda X_i: (X_i - u) ** 2, X)) / (len(X) - 1)
+ 
+def sample_standard_deviation(X, u = None, v = None):
+  """Computes the sample standard deviation of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+    v (number)     : The sample variance of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  if v is None: v = sample_variance(X, u)
+  return sqrt(v)
+
+def combine_sample_size(As):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+
+  .. math::
+
+    n = \sum{i = 0}^{g - 1} n_i
+  """
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i), As))
+
+def combine_arithmetic_mean(As, n = None):
+  """Computes the combined arithmetic mean of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+
+  .. math::
+
+    u = \frac{\sum{i = 0}^{g - 1} n_i u_i}{n}
+  """
+  if n is None: n = combine_sample_size(As)
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i * u_i), As)) / n
+  
+def combine_sample_variance(As, n = None, u = None):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+
+  .. math::
+
+    v = \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  return sum(imap(unpack_tuple(
+    lambda u_i, s_i, n_i, t_i: n_i * (u_i - u) ** 2 + (s_i ** 2) * (n_i - 1)
+  ), As)) / (n - 1)
+
+def combine_sample_standard_deviation(As, n = None, u = None, v = None):
+  """Computes the combined sample standard deviation of a group of
+  `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+    v (number)                           : The combined sample variance of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  if v is None: v = combine_sample_variance(As, n, u)
+  return sqrt(v)
+
+###############################################################################
+
+def process_program_arguments():
+  ap = argument_parser(
+    description = (
+      "Aggregates the results of multiple runs of benchmark results stored in "
+      "CSV format."
+    )
+  )
+
+  ap.add_argument(
+    "-d", "--dependent-variable",
+    help = ("Treat the specified three variables as a dependent variable. The "
+            "1st variable is the measured quantity, the 2nd is the uncertainty "
+            "of the measurement and the 3rd is the sample size. The defaults "
+            "are the dependent variables of Thrust's benchmark suite. May be "
+            "specified multiple times."),
+    action = "append", type = str, dest = "dependent_variables",
+    metavar = "QUANTITY,UNCERTAINTY,SAMPLES"
+  )
+
+  ap.add_argument(
+    "-p", "--preserve-whitespace",
+    help = ("Don't trim leading and trailing whitespace from each CSV cell."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "-o", "--output-file",
+    help = ("The file that results are written to. If `-`, results are "
+            "written to stdout."),
+    action = "store", type = str, default = "-",
+    metavar = "OUTPUT"
+  )
+
+  ap.add_argument(
+    "input_files",
+    help = ("Input CSV files. The first two rows should be a header. The 1st "
+            "header row specifies the name of each variable, and the 2nd "
+            "header row specifies the units for that variable."),
+    type = str, nargs = "+",
+    metavar = "INPUTS"
+  )
+
+  return ap.parse_args()
+
+###############################################################################
+
+def filter_comments(f, s = "#"):
+  """Return an iterator to the file `f` which filters out all lines beginning
+  with `s`."""
+  return filter(lambda line: not line.startswith(s), f)
+
+###############################################################################
+
+class io_manager(object):
+  """Manages I/O operations and represents the input data as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`. It can be used with `with`.
+
+  Attributes:
+    preserve_whitespace (`bool`) :
+      If `False`, leading and trailing whitespace is stripped from each CSV cell.
+    writer (`csv_dict_writer`) :
+      CSV writer object that the output is written to.
+    output_file (`file` or `stdout`) :
+      The output `file` object.
+    readers (`list` of `csv_dict_reader`s) :
+      List of input files as CSV reader objects.
+    input_files (list of `file`s) :
+      List of input `file` objects.
+    variable_names (`list` of `str`s) :
+      Names of the variables, in order. 
+    variable_units (`list` of `str`s) :
+      Units of the variables, in order. 
+  """
+
+  def __init__(self, input_files, output_file, preserve_whitespace = True):
+    """Read input files and open the output file and construct a new `io_manager`
+    object.
+
+    If `preserve_whitespace` is `False`, leading and trailing whitespace is
+    stripped from each CSV cell.
+
+    Raises
+      AssertionError :
+        If `len(input_files) <= 0` or `type(preserve_whitespace) != bool`.
+    """
+    assert len(input_files) > 0, "No input files provided."
+
+    assert type(preserve_whitespace) == bool
+
+    self.preserve_whitespace = preserve_whitespace
+
+    self.readers = deque()
+
+    self.variable_names = None
+    self.variable_units = None
+
+    self.input_files = deque()
+
+    for input_file in input_files:
+      input_file_object = open(input_file)
+      reader = csv_dict_reader(filter_comments(input_file_object))
+
+      if not self.preserve_whitespace:
+        strip_list(reader.fieldnames)
+
+      if self.variable_names is None:
+        self.variable_names = reader.fieldnames
+      else:
+        # Make sure all inputs have the same schema.
+        assert self.variable_names == reader.fieldnames,                      \
+          "Input file (`" + input_file + "`) variable schema `"             + \
+          str(reader.fieldnames) + "` does not match the variable schema `" + \
+          str(self.variable_names) + "`."
+
+      # Consume the next row, which should be the second line of the header.
+      variable_units = reader.next()
+
+      if not self.preserve_whitespace:
+        strip_dict(variable_units)
+
+      if self.variable_units is None:
+        self.variable_units = variable_units
+      else:
+        # Make sure all inputs have the same units schema.
+        assert self.variable_units == variable_units,                         \
+          "Input file (`" + input_file + "`) units schema `"                + \
+          str(variable_units) + "` does not match the units schema `"       + \
+          str(self.variable_units) + "`."
+
+      self.readers.append(reader)
+      self.input_files.append(input_file_object)
+ 
+    if   output_file == "-": # Output to stdout.
+      self.output_file = stdout
+    else:                    # Output to user-specified file.
+      self.output_file = open(output_file, "w")
+
+    self.writer = csv_dict_writer(
+      self.output_file, fieldnames = self.variable_names
+    )
+
+  def __enter__(self):
+    """Called upon entering a `with` statement."""
+    return self
+
+  def __exit__(self, *args):
+    """Called upon exiting a `with` statement."""
+    if   self.output_file is stdout:
+      self.output_file = None
+    elif self.output_file is not None:
+      self.output_file.__exit__(*args)
+
+    for input_file in self.input_files:
+      input_file.__exit__(*args)
+
+  #############################################################################
+  # Input Stream.
+
+  def __iter__(self):
+    """Return an iterator to the input sequence.
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def next(self):
+    """Consume and return the next record (a `dict` representing a CSV row) in
+    the input.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration : If there is no more input.
+    """
+    if len(self.readers) == 0:
+      raise StopIteration()
+
+    try:
+      row = self.readers[0].next()
+      if not self.preserve_whitespace: strip_dict(row)
+      return row
+    except StopIteration:
+      # The current reader is empty, so pop it, pop it's input file, close the
+      # input file, and then call ourselves again. 
+      self.readers.popleft()
+      self.input_files.popleft().close()
+      return self.next()
+
+  #############################################################################
+  # Output.
+
+  def write_header(self):
+    """Write the header for the output CSV file."""
+    # Write the first line of the header.
+    self.writer.writeheader()
+
+    # Write the second line of the header.
+    self.writer.writerow(self.variable_units)
+
+  def write(self, d):
+    """Write a record (a `dict`) to the output CSV file."""
+    self.writer.writerow(d)
+
+###############################################################################
+
+class dependent_variable_parser(object):
+  """Parses a `--dependent-variable=AVG,STDEV,TRIALS` command line argument."""
+
+  #############################################################################
+  # Grammar
+
+  # Parse a variable_name.
+  variable_name_rule = r'[^,]+'
+
+  # Parse a variable classification.        
+  dependent_variable_rule = r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'
+
+  engine = regex_compile(dependent_variable_rule)
+
+  #############################################################################
+
+  def __call__(self, s):
+    """Parses the string `s` with the form "AVG,STDEV,TRIALS".
+
+    Returns:
+      A `measured_variable`. 
+
+    Raises:
+      AssertionError : If parsing fails.
+    """
+
+    match = self.engine.match(s)
+
+    assert match is not None,                                          \
+      "Dependent variable (-d) `" +s+ "` is invalid, the format is " + \
+      "`AVG,STDEV,TRIALS`."
+
+    return measured_variable(match.group(1), match.group(2), match.group(3))
+
+###############################################################################
+
+class record_aggregator(object):
+  """Consumes and combines records and represents the result as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`.
+
+  Attributes:
+    dependent_variables (`list` of `measured_variable`s) :
+      A list of dependent variables provided on the command line.
+    dataset (`dict`) :
+      A mapping of distinguishing (e.g. control + independent) values (`tuple`s
+      of variable-quantity pairs) to `list`s of dependent values (`dict`s from 
+      variables to lists of cells).
+    in_order_dataset_keys :
+      A list of unique dataset keys (e.g. distinguishing variables) in order of
+      appearance.
+  """
+
+  parse_dependent_variable = dependent_variable_parser()
+
+  def __init__(self, raw_dependent_variables):
+    """Parse dependent variables and construct a new `record_aggregator` object.
+
+    Raises:
+      AssertionError : If parsing of dependent variables fails.
+    """
+    self.dependent_variables = []
+
+    if raw_dependent_variables is not None:
+      for variable in raw_dependent_variables:
+        self.dependent_variables.append(self.parse_dependent_variable(variable))
+
+    self.dataset = {}
+
+    self.in_order_dataset_keys = deque()
+
+  #############################################################################
+  # Insertion.
+
+  def append(self, record):
+    """Add `record` to the dataset.
+
+    Raises:
+      ValueError : If any `str`-to-numeric conversions fail.
+    """
+    # The distinguishing variables are the control and independent variables.
+    # They form the key for each record in the dataset. Records with the same
+    # distinguishing variables are treated as observations of the same data
+    # point.
+    dependent_values = {}
+
+    # To allow the same sample size variable to be used for multiple dependent
+    # variables, we don't pop sample size variables until we're done processing
+    # all variables.
+    sample_size_variables = []
+
+    # Separate the dependent values from the distinguishing variables and
+    # perform `str`-to-numeric conversions.
+    for variable in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = variable.as_tuple()
+
+      dependent_values[quantity]    = [int_or_float(record.pop(quantity))]
+      dependent_values[uncertainty] = [int_or_float(record.pop(uncertainty))]
+      dependent_values[sample_size] = [int(record[sample_size])]
+
+      sample_size_variables.append(sample_size)
+
+    # Pop sample size variables.
+    for sample_size_variable in sample_size_variables:
+      # Allowed to fail, as we may have duplicates.
+      record.pop(sample_size_variable, None)
+
+    # `dict`s aren't hashable, so create a tuple of key-value pairs.
+    distinguishing_values = tuple(record.items())
+
+    if distinguishing_values in self.dataset:
+      # These distinguishing values already exist, so get the `dict` they're
+      # mapped to, look up each key in `dependent_values` in the `dict`, and
+      # add the corresponding quantity in `dependent_values` to the list in the
+      # the `dict`.
+      for variable, columns in dependent_values.iteritems():
+        self.dataset[distinguishing_values][variable] += columns
+    else:
+      # These distinguishing values aren't in the dataset, so add them and
+      # record them in `in_order_dataset_keys`.
+      self.dataset[distinguishing_values] = dependent_values
+      self.in_order_dataset_keys.append(distinguishing_values)
+
+  #############################################################################
+  # Postprocessing.
+
+  def combine_dependent_values(self, dependent_values):
+    """Takes a mapping of dependent variables to lists of cells and returns
+    a new mapping with the cells combined.
+
+    Raises:
+      AssertionError : If class invariants were violated.
+    """
+    combined_dependent_values = dependent_values.copy()
+
+    for variable in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = variable.as_tuple()
+
+      quantities    = dependent_values[quantity]
+      uncertainties = dependent_values[uncertainty]
+      sample_sizes  = dependent_values[sample_size]
+
+      if type(sample_size) is list:
+        # Sample size hasn't been combined yet.
+        assert len(quantities)    == len(uncertainties)                       \
+           and len(uncertainties) == len(sample_sizes),                       \
+          "Length of quantities list `(" + str(len(quantities)) + ")`, "    + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          "),` and length of sample sizes list `(" + str(len(sample_sizes)) + \
+          ")` are not the same."
+      else:
+        # Another dependent variable that uses our sample size has combined it
+        # already.
+        assert len(quantities) == len(uncertainties),                         \
+          "Length of quantities list `(" + str(len(quantities)) + ")` and " + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          ")` are not the same."
+
+      # Convert the three separate `list`s into one list of `measured_value`s.
+      measured_values = []
+
+      for i in range(len(quantities)):
+        mv = measured_value(
+          quantities[i], uncertainties[i], sample_sizes[i], units
+        )
+
+        measured_values.append(mv)
+
+      # Combine the `measured_value`s.
+      combined_sample_size = combine_sample_size(
+        measured_values
+      )
+
+      combined_arithmetic_mean = combine_arithmetic_mean(
+        measured_values, combined_sample_size
+      )
+
+      combined_sample_standard_deviation = combine_sample_standard_deviation(
+        measured_values, combined_sample_size, combined_arithmetic_mean
+      )
+
+      # Round the quantity and uncertainty to the significant digit of
+      # uncertainty and insert the combined values into the results.
+      sigdig = find_significant_digit(combined_sample_standard_deviation)
+
+#      combined_arithmetic_mean = round_with_int_conversion(
+#        combined_arithmetic_mean, sigdig
+#      )
+
+#      combined_sample_standard_deviation = round_with_int_conversion(
+#        combined_sample_standard_deviation, sigdig
+#      )
+
+      combined_dependent_values[quantity]    = combined_arithmetic_mean
+      combined_dependent_values[uncertainty] = combined_sample_standard_deviation
+      combined_dependent_values[sample_size] = combined_sample_size
+
+    return combined_dependent_values
+
+  ############################################################################# 
+  # Output Stream.
+
+  def __iter__(self):
+    """Return an iterator to the output sequence of separated distinguishing
+    variables and dependent variables (a tuple of two `dict`s).
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def records(self):
+    """Return an iterator to the output sequence of CSV rows (`dict`s of
+    variables to values).
+    """
+    return imap(unpack_tuple(lambda dist, dep: merge_dicts(dist, dep)), self)
+
+  def next(self):
+    """Produce the components of the next output record - a tuple of two
+    `dict`s. The first `dict` is a mapping of distinguishing variables to
+    distinguishing values, the second `dict` is a mapping of dependent
+    variables to combined dependent values. Combining the two dicts forms a
+    CSV row suitable for output.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration  : If there is no more output.
+      AssertionError : If class invariants were violated.
+    """
+    assert len(self.dataset.keys()) == len(self.in_order_dataset_keys),      \
+      "Number of dataset keys (`" + str(len(self.dataset.keys()))          + \
+      "`) is not equal to the number of keys in the ordering list (`"      + \
+      str(len(self.in_order_dataset_keys)) + "`)."
+
+    if len(self.in_order_dataset_keys) == 0:
+      raise StopIteration()
+
+    # Get the next set of distinguishing values and convert them to a `dict`.
+    raw_distinguishing_values = self.in_order_dataset_keys.popleft()
+    distinguishing_values     = dict(raw_distinguishing_values)
+
+    dependent_values = self.dataset.pop(raw_distinguishing_values)
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return (distinguishing_values, combined_dependent_values)
+
+###############################################################################
+
+args = process_program_arguments()
+
+if args.dependent_variables is None:
+  args.dependent_variables = [
+    "STL Average Walltime,STL Walltime Uncertainty,STL Trials",
+    "STL Average Throughput,STL Throughput Uncertainty,STL Trials",
+    "Thrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials",
+    "Thrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"
+  ]
+
+# Read input files and open the output file.
+with io_manager(args.input_files,
+                args.output_file,
+                args.preserve_whitespace) as iom:
+  # Parse dependent variable options.
+  ra = record_aggregator(args.dependent_variables)
+
+  # Add all input data to the `record_aggregator`.
+  for record in iom:
+    ra.append(record)
+
+  iom.write_header()
+
+  # Write combined results out.
+  for record in ra.records():
+    iom.write(record)
+
diff --git a/internal/benchmark/compare_benchmark_results.py b/internal/benchmark/compare_benchmark_results.py
new file mode 100755
index 000000000..22e7be8cf
--- /dev/null
+++ b/internal/benchmark/compare_benchmark_results.py
@@ -0,0 +1,1308 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2012-7 Bryce Adelstein Lelbach aka wash <brycelelbach@gmail.com>
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+###############################################################################
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+# XXX Put code shared with `combine_benchmark_results.py` in a common place.
+
+# XXX Relative uncertainty.
+
+# XXX Create uncertain value class which is quantity + uncertainty.
+
+from sys import exit, stdout
+
+from os.path import splitext
+
+from itertools import imap # Lazy map.
+
+from math import sqrt, log10, floor
+
+from collections import deque
+
+from argparse import ArgumentParser as argument_parser
+from argparse import Action as argument_action
+
+from csv import DictReader as csv_dict_reader
+from csv import DictWriter as csv_dict_writer
+
+from re import compile as regex_compile
+
+###############################################################################
+
+def unpack_tuple(f):
+  """Return a unary function that calls `f` with its argument unpacked."""
+  return lambda args: f(*iter(args))
+
+def strip_dict(d):
+  """Strip leading and trailing whitespace from all keys and values in `d`.
+
+  Returns:
+    The modified dict `d`.
+  """
+  d.update({key: value.strip() for (key, value) in d.items()})
+  return d
+
+def merge_dicts(d0, d1):
+  """Create a new `dict` that is the union of `dict`s `d0` and `d1`."""
+  d = d0.copy()
+  d.update(d1)
+  return d
+
+def change_key_in_dict(d, old_key, new_key):
+  """Change the key of the entry in `d` with key `old_key` to `new_key`. If
+  there is an existing entry 
+
+  Returns:
+    The modified dict `d`.
+
+  Raises:
+    KeyError : If `old_key` is not in `d`.
+  """
+  d[new_key] = d.pop(old_key)
+  return d
+
+def key_from_dict(d):
+  """Create a hashable key from a `dict` by converting the `dict` to a tuple."""
+  return tuple(sorted(d.items()))
+
+def strip_list(l):
+  """Strip leading and trailing whitespace from all values in `l`."""
+  for i, value in enumerate(l): l[i] = value.strip()
+  return l
+
+def remove_from_list(l, item):
+  """Remove the first occurence of `item` from list `l` and return a tuple of
+  the index that was removed and the element that was removed.
+
+  Raises:
+    ValueError : If `item` is not in `l`.
+  """
+  idx = l.index(item)
+  item = l.pop(idx)
+  return (idx, item)
+
+###############################################################################
+
+def int_or_float(x):
+  """Convert `x` to either `int` or `float`, preferring `int`.
+
+  Raises:
+    ValueError : If `x` is not convertible to either `int` or `float`
+  """
+  try:
+    return int(x)
+  except ValueError:
+    return float(x)
+
+def try_int_or_float(x):
+  """Try to convert `x` to either `int` or `float`, preferring `int`. `x` is
+  returned unmodified if conversion fails.
+  """
+  try:
+    return int_or_float(x)
+  except ValueError:
+    return x
+
+###############################################################################
+
+def ranges_overlap(x1, x2, y1, y2):
+  """Returns true if the ranges `[x1, x2]` and `[y1, y2]` overlap,
+  where `x1 <= x2` and `y1 <= y2`.
+
+  Raises:
+    AssertionError : If `x1 > x2` or `y1 > y2`.
+  """
+  assert x1 <= x2
+  assert y1 <= y2
+  return x1 <= y2 and y1 <= x2
+
+def ranges_overlap_uncertainty(x, x_unc, y, y_unc):
+  """Returns true if the ranges `[x - x_unc, x + x_unc]` and
+  `[y - y_unc, y + y_unc]` overlap, where `x_unc >= 0` and `y_unc >= 0`.
+
+  Raises:
+    AssertionError : If `x_unc < 0` or `y_unc < 0`.
+  """
+  assert x_unc >= 0
+  assert y_unc >= 0
+  return ranges_overlap(x - x_unc, x + x_unc, y - y_unc, y + y_unc)
+
+###############################################################################
+
+# Formulas for propagation of uncertainty from:
+#
+#   https://en.wikipedia.org/wiki/Propagation_of_uncertainty#Example_formulas
+#
+# Even though it's Wikipedia, I trust it as I helped write that table.
+#
+# XXX Replace with a proper reference.
+
+def uncertainty_multiplicative(f, A, A_abs_unc, B, B_abs_unc):
+  """Compute the propagated uncertainty from the multiplication of two
+  uncertain values, `A +/- A_abs_unc` and `B +/- B_abs_unc`. Given `f = AB` or
+  `f = A/B`, where `A != 0` and `B != 0`, the uncertainty in `f` is
+  approximately:
+
+  .. math::
+
+    \sigma_f = |f| \sqrt{\frac{\sigma_A}{A} ^ 2 + \frac{\sigma_B}{B} ^ 2}
+
+  Raises:
+    ZeroDivisionError : If `A == 0` or `B == 0`.
+  """
+  return abs(f) * sqrt((A_abs_unc / A) ** 2 + (B_abs_unc / B) ** 2);
+
+def uncertainty_additive(c, A_abs_unc, d, B_abs_unc):
+  """Compute the propagated uncertainty from addition of two uncertain values,
+  `A +/- A_abs_unc` and `B +/- B_abs_unc`. Given `f = cA + dB`, where `c` and
+  `d` are certain constants, the uncertainty in `f` is approximately:
+
+  .. math::
+
+    f_{\sigma} = \sqrt{c ^ 2 * A_{\sigma} ^ 2 + d ^ 2 * B_{\sigma} ^ 2}
+  """
+  return sqrt(((c ** 2) * (A_abs_unc ** 2)) + ((d ** 2) * (B_abs_unc ** 2)))
+
+###############################################################################
+
+# XXX Create change class.
+
+def absolute_change(old, new):
+  """Computes the absolute change from old to new:
+
+  .. math::
+
+    absolute_change = new - old
+  """
+  return new - old
+
+def absolute_change_uncertainty(old, old_unc, new, new_unc):
+  """Computes the uncertainty in the absolute change from old to new and returns
+  a tuple of the absolute change and the absolute change uncertainty.
+  """
+  absolute_change     = new - old
+  absolute_change_unc = uncertainty_additive(1.0, new_unc, -1.0, old_unc)
+
+  return (absolute_change, absolute_change_unc)
+
+def percent_change(old, new):
+  """Computes the percent change from old to new:
+
+  .. math::
+
+    percent_change = 100 \frac{new - old}{abs(old)}
+  """
+  return float(new - old) / abs(old)
+
+def percent_change_uncertainty(old, old_unc, new, new_unc):
+  """Computes the uncertainty in the percent change from old to new and returns
+  a tuple of the absolute change, the absolute change uncertainty, the percent
+  change and the percent change uncertainty.
+  """
+  # Let's break this down into a few sub-operations:
+  # 
+  #   absolute_change = new - old         <- Additive propagation.
+  #   relative_change = change / abs(old) <- Multiplicative propagation.
+  #   percent_change  = 100 * y           <- Multiplicative propagation.
+
+  if old == 0:
+    # We can't compute relative change because the old value is 0.
+    return (float("nan"), float("nan"), float("nan"), float("nan"))
+
+  (absolute_change, absolute_change_unc) = absolute_change_uncertainty(
+    old, old_unc, new, new_unc
+  )
+
+  if absolute_change == 0:
+    # We can't compute relative change uncertainty because the relative
+    # uncertainty of a value of 0 is undefined.
+    return (absolute_change, absolute_change_unc, float("nan"), float("nan"))
+
+  relative_change     = float(absolute_change) / abs(old)
+  relative_change_unc = uncertainty_multiplicative(
+    relative_change, absolute_change, absolute_change_unc, old, old_unc
+  )
+
+  percent_change = 100.0 * relative_change
+  percent_change_unc = uncertainty_multiplicative(
+    percent_change, 100.0, 0.0, relative_change, relative_change_unc
+  )
+
+  return (
+    absolute_change, absolute_change_unc, percent_change, percent_change_unc
+  )
+
+###############################################################################
+
+def find_significant_digit(x):
+  """Return the significant digit of the number x. The result is the number of
+  digits after the decimal place to round to (negative numbers indicate rounding
+  before the decimal place)."""
+  if x == 0: return 0
+  return -int(floor(log10(abs(x))))
+
+def round_with_int_conversion(x, ndigits = None):
+  """Rounds `x` to `ndigits` after the the decimal place. If `ndigits` is less
+  than 1, convert the result to `int`. If `ndigits` is `None`, the significant
+  digit of `x` is used."""
+  if ndigits is None: ndigits = find_significant_digit(x)
+  x_rounded = round(x, ndigits)
+  return int(x_rounded) if ndigits < 1 else x_rounded
+
+###############################################################################
+
+class measured_variable(object):
+  """A meta-variable representing measured data. It is composed of three raw
+  variables plus units meta-data.
+
+  Attributes:
+    quantity (`str`) :
+      Name of the quantity variable of this object.
+    uncertainty (`str`) :
+      Name of the uncertainty variable of this object.
+    sample_size (`str`) :
+      Name of the sample size variable of this object.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+class measured_value(object):
+  """An object that represents a value determined by multiple measurements.
+
+  Attributes:
+    quantity (scalar) :
+      The quantity of the value, e.g. the arithmetic mean.
+    uncertainty (scalar) :
+      The measurement uncertainty, e.g. the sample standard deviation.
+    sample_size (`int`) :
+      The number of observations contributing to the value.
+    units (units class or `None`) :
+      The units the value is measured in.
+  """
+
+  def __init__(self, quantity, uncertainty, sample_size = 1, units = None):
+    self.quantity    = quantity
+    self.uncertainty = uncertainty
+    self.sample_size = sample_size
+    self.units       = units
+
+  def as_tuple(self):
+    return (self.quantity, self.uncertainty, self.sample_size, self.units)
+
+  def __iter__(self):
+    return iter(self.as_tuple())
+
+  def __str__(self):
+    return str(self.as_tuple())
+
+  def __repr__(self):
+    return str(self)
+
+###############################################################################
+
+def arithmetic_mean(X):
+  """Computes the arithmetic mean of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+
+  .. math::
+
+    u = \frac{\sum_{i = 0}^{n - 1} X_i}{n}
+  """
+  return sum(X) / len(X)
+
+def sample_variance(X, u = None):
+  """Computes the sample variance of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    v = \frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  return sum(imap(lambda X_i: (X_i - u) ** 2, X)) / (len(X) - 1)
+ 
+def sample_standard_deviation(X, u = None, v = None):
+  """Computes the sample standard deviation of the sequence `X`.
+
+  Let:
+
+    * `n = len(X)`.
+    * `u` denote the arithmetic mean of `X`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+
+    s &= \sqrt{v}
+      &= \sqrt{\frac{\sum_{i = 0}^{n - 1} (X_i - u)^2}{n - 1}}
+
+  Args:
+    X (`Iterable`) : The sequence of values.
+    u (number)     : The arithmetic mean of `X`.
+    v (number)     : The sample variance of `X`.
+  """
+  if u is None: u = arithmetic_mean(X)
+  if v is None: v = sample_variance(X, u)
+  return sqrt(v)
+
+def combine_sample_size(As):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+
+  .. math::
+
+    n = \sum{i = 0}^{g - 1} n_i
+  """
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i), As))
+
+def combine_arithmetic_mean(As, n = None):
+  """Computes the combined arithmetic mean of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+
+  .. math::
+
+    u = \frac{\sum{i = 0}^{g - 1} n_i u_i}{n}
+  """
+  if n is None: n = combine_sample_size(As)
+  return sum(imap(unpack_tuple(lambda u_i, s_i, n_i, t_i: n_i * u_i), As)) / n
+  
+def combine_sample_variance(As, n = None, u = None):
+  """Computes the combined sample variance of a group of `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+
+  .. math::
+
+    v = \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  return sum(imap(unpack_tuple(
+    lambda u_i, s_i, n_i, t_i: n_i * (u_i - u) ** 2 + (s_i ** 2) * (n_i - 1)
+  ), As)) / (n - 1)
+
+def combine_sample_standard_deviation(As, n = None, u = None, v = None):
+  """Computes the combined sample standard deviation of a group of
+  `measured_value`s.
+
+  Let:
+
+    * `g = len(As)`.
+    * `u_i = As[i].quantity`.
+    * `s_i = As[i].uncertainty`.
+    * `n_i = As[i].samples`.
+    * `n` denote the combined sample size of `As`.
+    * `u` denote the arithmetic mean of the quantities of `As`.
+    * `v` denote the sample variance of `X`.
+    * `s` denote the sample standard deviation of `X`.
+
+  .. math::
+    v &= \frac{(\sum_{i = 0}^{g - 1} n_i (u_i - u)^2 + s_i^2 (n_i - 1))}{n - 1}
+
+    s &= \sqrt{v}
+
+  Args:
+    As (`Iterable` of `measured_value`s) : The sequence of values.
+    n (number)                           : The combined sample sizes of `As`.
+    u (number)                           : The combined arithmetic mean of `As`.
+    v (number)                           : The combined sample variance of `As`.
+  """
+  if n <= 1: return 0
+  if n is None: n = combine_sample_size(As)
+  if u is None: u = combine_arithmetic_mean(As, n)
+  if v is None: v = combine_sample_variance(As, n, u)
+  return sqrt(v)
+
+###############################################################################
+
+def store_const_multiple(const, *destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `const`."""
+  class store_const_multiple_action(argument_action):
+    def __init__(self, *args, **kwargs):
+      super(store_const_multiple_action, self).__init__(
+        metavar = None, nargs = 0, const = const, *args, **kwargs
+      )
+
+    def __call__(self, parser, namespace, values, option_string = None):
+      for destination in destinations:
+        setattr(namespace, destination, const)
+
+  return store_const_multiple_action
+
+def store_true_multiple(*destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `True`."""
+  return store_const_multiple(True, *destinations)
+
+def store_false_multiple(*destinations):
+  """Returns an `argument_action` class that sets multiple argument
+  destinations (`destinations`) to `False`."""
+  return store_const_multiple(False, *destinations)
+
+###############################################################################
+
+def process_program_arguments():
+  ap = argument_parser(
+    description = (
+      "Compares two sets of combined performance results and identifies "
+      "statistically significant changes."
+    )
+  )
+
+  ap.add_argument(
+    "baseline_input_file",
+    help = ("CSV file containing the baseline performance results. The first "
+            "two rows should be a header. The 1st header row specifies the "
+            "name of each variable, and the 2nd header row specifies the units "
+            "for that variable. The baseline results may be a superset of the "
+            "observed performance results, but the reverse is not true. The "
+            "baseline results must contain data for every datapoint in the "
+            "observed performance results."),            
+    type = str
+  )
+
+  ap.add_argument(
+    "observed_input_file",
+    help = ("CSV file containing the observed performance results. The first "
+            "two rows should be a header. The 1st header row specifies the name "
+            "of header row specifies the units for that variable."),
+    type = str
+  )
+
+  ap.add_argument(
+    "-o", "--output-file",
+    help = ("The file that results are written to. If `-`, results are "
+            "written to stdout."),
+    action = "store", type = str, default = "-",
+    metavar = "OUTPUT"
+  )
+
+  ap.add_argument(
+    "-c", "--control-variable",
+    help = ("Treat the specified variable as a control variable. This means "
+            "it will be filtered out when forming dataset keys. For example, "
+            "this could be used to ignore a timestamp variable that is "
+            "different in the baseline and observed results. May be specified "
+            "multiple times."),
+    action = "append", type = str, dest = "control_variables", default = [],
+    metavar = "QUANTITY"
+  )
+
+  ap.add_argument(
+    "-d", "--dependent-variable",
+    help = ("Treat the specified three variables as a dependent variable. The "
+            "1st variable is the measured quantity, the 2nd is the uncertainty "
+            "of the measurement and the 3rd is the sample size. The defaults "
+            "are the dependent variables of Thrust's benchmark suite. May be "
+            "specified multiple times."),
+    action = "append", type = str, dest = "dependent_variables", default = [],
+    metavar = "QUANTITY,UNCERTAINTY,SAMPLES"
+  )
+
+  ap.add_argument(
+    "-t", "--change-threshold",
+    help = ("Treat relative changes less than this amount (a percentage) as "
+            "statistically insignificant. The default is 5%%."),
+    action = "store", type = float, default = 5,
+    metavar = "PERCENTAGE"
+  )
+
+  ap.add_argument(
+    "-p", "--preserve-whitespace",
+    help = ("Don't trim leading and trailing whitespace from each CSV cell."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "--output-all-variables",
+    help = ("Don't omit original absolute values in output."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "--output-all-datapoints",
+    help = ("Don't omit datapoints that are statistically indistinguishable "
+            "in output."),
+    action = "store_true", default = False
+  )
+
+  ap.add_argument(
+    "-a", "--output-all",
+    help = ("Equivalent to `--output-all-variables --output-all-datapoints`."),
+    action = store_true_multiple("output_all_variables", "output_all_datapoints")
+  )
+
+  return ap.parse_args()
+
+###############################################################################
+
+def filter_comments(f, s = "#"):
+  """Return an iterator to the file `f` which filters out all lines beginning
+  with `s`."""
+  return filter(lambda line: not line.startswith(s), f)
+
+###############################################################################
+
+class io_manager(object):
+  """Manages I/O operations and represents the input data as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`. It can be used with `with`.
+
+  Attributes:
+    preserve_whitespace (`bool`) :
+      If `False`, leading and trailing whitespace is stripped from each CSV cell.
+    writer (`csv_dict_writer`) :
+      CSV writer object that the output is written to.
+    output_file (`file` or `stdout`) :
+      The output `file` object.
+    baseline_reader (`csv_dict_reader`) :
+      CSV reader object for the baseline results.
+    observed_reader (`csv_dict_reader`) :
+      CSV reader object for the observed results.
+    baseline_input_file (`file`) :
+      `file` object for the baseline results.
+    observed_input_file (`file`) :
+      `file` object for the observed results..
+    variable_names (`list` of `str`s) :
+      Names of the variables, in order. 
+    variable_units (`list` of `str`s) :
+      Units of the variables, in order. 
+  """
+
+  def __init__(self,
+               baseline_input_file, observed_input_file,
+               output_file,
+               preserve_whitespace = False):
+    """Read input files and open the output file and construct a new `io_manager`
+    object.
+
+    If `preserve_whitespace` is `False`, leading and trailing whitespace is
+    stripped from each CSV cell.
+
+    Raises
+      AssertionError :
+        If `type(preserve_whitespace) != bool`.
+    """
+    assert type(preserve_whitespace) == bool
+
+    self.preserve_whitespace = preserve_whitespace
+
+    # Open baseline results.
+    self.baseline_input_file = open(baseline_input_file)
+    self.baseline_reader = csv_dict_reader(
+      filter_comments(self.baseline_input_file)
+    )
+
+    if not self.preserve_whitespace:
+      strip_list(self.baseline_reader.fieldnames)
+
+    self.variable_names = list(self.baseline_reader.fieldnames) # Copy.
+    self.variable_units = self.baseline_reader.next()
+
+    if not self.preserve_whitespace:
+      strip_dict(self.variable_units)
+
+    # Open observed results.
+    self.observed_input_file = open(observed_input_file)
+    self.observed_reader = csv_dict_reader(
+      filter_comments(self.observed_input_file)
+    )
+
+    if not self.preserve_whitespace:
+      strip_list(self.observed_reader.fieldnames)
+
+    # Make sure all inputs have the same variables schema.
+    assert self.variable_names == self.observed_reader.fieldnames,             \
+      "Observed results input file (`" + observed_input_file + "`) "         + \
+      "variable schema `" + str(self.observed_reader.fieldnames) + "` does " + \
+      "not match the baseline results input file (`" + baseline_input_file   + \
+      "`) variable schema `" + str(self.variable_names) + "`."
+
+    # Consume the next row, which should be the second line of the header.
+    observed_variable_units = self.observed_reader.next()
+
+    if not self.preserve_whitespace:
+      strip_dict(observed_variable_units)
+
+    # Make sure all inputs have the same units schema.
+    assert self.variable_units == observed_variable_units,                    \
+      "Observed results input file (`" + observed_input_file + "`) "        + \
+      "units schema `" + str(observed_variable_units) + "` does not "       + \
+      "match the baseline results input file (`" + baseline_input_file      + \
+      "`) units schema `" + str(self.variable_units) + "`."
+
+    if   output_file == "-": # Output to stdout.
+      self.output_file = stdout
+    else:                    # Output to user-specified file.
+      self.output_file = open(output_file, "w")
+
+    self.writer = csv_dict_writer(
+      self.output_file, fieldnames = self.variable_names
+    )
+
+  def __enter__(self):
+    """Called upon entering a `with` statement."""
+    return self
+
+  def __exit__(self, *args):
+    """Called upon exiting a `with` statement."""
+    if   self.output_file is stdout:
+      self.output_file = None
+    elif self.output_file is not None:
+      self.output_file.__exit__(*args)
+
+    self.baseline_input_file.__exit__(*args)
+    self.observed_input_file.__exit__(*args)
+
+  def append_variable(self, name, units):
+    """Add a new variable to the output schema."""
+    self.variable_names.append(name)
+    self.variable_units.update({name : units})
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+  def insert_variable(self, idx, name, units):
+    """Insert a new variable into the output schema at index `idx`."""
+    self.variable_names.insert(idx, name)
+    self.variable_units.update({name : units})
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+  def remove_variable(self, name):
+    """Remove variable from the output schema and return a tuple of the variable
+    index and the variable units.
+
+    Raises:
+      ValueError : If `name` is not in the output schema.
+    """
+    # Remove the variable and get its index, which we'll need to remove the
+    # corresponding units entry.
+    (idx, item) = remove_from_list(self.variable_names, name)
+
+    # Remove the units entry.
+    units = self.variable_units.pop(item)
+
+    # Update CSV writer field names.
+    self.writer.fieldnames = self.variable_names
+
+    return (idx, units)
+
+  #############################################################################
+  # Input Stream.
+
+  def baseline(self):
+    """Return an iterator to the baseline results input sequence."""
+    return imap(lambda row: strip_dict(row), self.baseline_reader) 
+
+  def observed(self):
+    """Return an iterator to the observed results input sequence."""
+    return imap(lambda row: strip_dict(row), self.observed_reader) 
+
+  #############################################################################
+  # Output.
+
+  def write_header(self):
+    """Write the header for the output CSV file."""
+    # Write the first line of the header.
+    self.writer.writeheader()
+
+    # Write the second line of the header.
+    self.writer.writerow(self.variable_units)
+
+  def write(self, d):
+    """Write a record (a `dict`) to the output CSV file."""
+    self.writer.writerow(d)
+
+###############################################################################
+
+class dependent_variable_parser(object):
+  """Parses a `--dependent-variable=AVG,STDEV,TRIALS` command line argument."""
+
+  #############################################################################
+  # Grammar
+
+  # Parse a variable_name.
+  variable_name_rule = r'[^,]+'
+
+  # Parse a variable classification.        
+  dependent_variable_rule = r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'   \
+                          + r','                               \
+                          + r'(' + variable_name_rule + r')'
+
+  engine = regex_compile(dependent_variable_rule)
+
+  #############################################################################
+
+  def __call__(self, s):
+    """Parses the string `s` with the form "AVG,STDEV,TRIALS".
+
+    Returns:
+      A `measured_variable`. 
+
+    Raises:
+      AssertionError : If parsing fails.
+    """
+
+    match = self.engine.match(s)
+
+    assert match is not None,                                          \
+      "Dependent variable (-d) `" +s+ "` is invalid, the format is " + \
+      "`AVG,STDEV,TRIALS`."
+
+    return measured_variable(match.group(1), match.group(2), match.group(3))
+
+###############################################################################
+
+class record_aggregator(object):
+  """Consumes and combines records and represents the result as an `Iterable`
+  sequence of `dict`s.
+
+  It is `Iterable` and an `Iterator`.
+
+  Attributes:
+    dependent_variables (`list` of `measured_variable`s) :
+      A list of dependent variables provided on the command line.
+    control_variables (`list` of `str`s) :
+      A list of control variables provided on the command line.
+    dataset (`dict`) :
+      A mapping of distinguishing (e.g. control + independent) values (`tuple`s
+      of variable-quantity pairs) to `list`s of dependent values (`dict`s from 
+      variables to lists of cells).
+    in_order_dataset_keys :
+      A list of unique dataset keys (e.g. distinguishing variables) in order of
+      appearance.
+  """
+
+  def __init__(self, dependent_variables, control_variables):
+    """Construct a new `record_aggregator` object.
+
+    Raises:
+      AssertionError : If parsing of dependent variables fails.
+    """
+    self.dependent_variables = dependent_variables
+    self.control_variables = control_variables
+
+    self.dataset = {}
+
+    self.in_order_dataset_keys = deque()
+
+  #############################################################################
+  # Insertion.
+
+  def key_from_dict(self, d):
+    """Create a hashable key from a `dict` by filtering out control variables
+    and then converting the `dict` to a tuple.
+
+    Raises:
+      AssertionError : If any control variable was not found in `d`.
+    """
+    distinguishing_values = d.copy()
+
+    # Filter out control variables.
+    for var in self.control_variables:
+      distinguishing_values.pop(var, None)
+
+    return key_from_dict(distinguishing_values)
+
+  def append(self, record):
+    """Add `record` to the dataset.
+
+    Raises:
+      ValueError : If any `str`-to-numeric conversions fail.
+    """
+    # The distinguishing variables are the control and independent variables.
+    # They form the key for each record in the dataset. Records with the same
+    # distinguishing variables are treated as observations of the same
+    # datapoint.
+    dependent_values = {}
+
+    # To allow the same sample size variable to be used for multiple dependent
+    # variables, we don't pop sample size variables until we're done processing
+    # all variables.
+    sample_size_variables = []
+
+    # Separate the dependent values from the distinguishing variables and
+    # perform `str`-to-numeric conversions.
+    for var in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = var.as_tuple()
+
+      dependent_values[quantity]    = [int_or_float(record.pop(quantity))]
+      dependent_values[uncertainty] = [int_or_float(record.pop(uncertainty))]
+      dependent_values[sample_size] = [int(record[sample_size])]
+
+      sample_size_variables.append(sample_size)
+
+    # Pop sample size variables.
+    for var in sample_size_variables:
+      # Allowed to fail, as we may have duplicates.
+      record.pop(var, None)
+
+    distinguishing_values = self.key_from_dict(record)
+
+    if distinguishing_values in self.dataset:
+      # These distinguishing values already exist, so get the `dict` they're
+      # mapped to, look up each key in `dependent_values` in the `dict`, and
+      # add the corresponding quantity in `dependent_values` to the list in the
+      # the `dict`.
+      for var, columns in dependent_values.iteritems():
+        self.dataset[distinguishing_values][var] += columns
+    else:
+      # These distinguishing values aren't in the dataset, so add them and
+      # record them in `in_order_dataset_keys`.
+      self.dataset[distinguishing_values] = dependent_values
+      self.in_order_dataset_keys.append(distinguishing_values)
+
+  #############################################################################
+  # Postprocessing.
+
+  def combine_dependent_values(self, dependent_values):
+    """Takes a mapping of dependent variables to lists of cells and returns
+    a new mapping with the cells combined.
+
+    Raises:
+      AssertionError : If class invariants were violated.
+    """
+    combined_dependent_values = dependent_values.copy()
+
+    for var in self.dependent_variables:
+      quantity, uncertainty, sample_size, units = var.as_tuple()
+
+      quantities    = dependent_values[quantity]
+      uncertainties = dependent_values[uncertainty]
+      sample_sizes  = dependent_values[sample_size]
+
+      if type(sample_size) is list:
+        # Sample size hasn't been combined yet.
+        assert len(quantities)    == len(uncertainties)                       \
+           and len(uncertainties) == len(sample_sizes),                       \
+          "Length of quantities list `(" + str(len(quantities)) + ")`, "    + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          "),` and length of sample sizes list `(" + str(len(sample_sizes)) + \
+          ")` are not the same."
+      else:
+        # Another dependent variable that uses our sample size has combined it
+        # already.
+        assert len(quantities) == len(uncertainties),                         \
+          "Length of quantities list `(" + str(len(quantities)) + ")` and " + \
+          "length of uncertainties list `(" + str(len(uncertainties))       + \
+          ")` are not the same."
+
+      # Convert the three separate `list`s into one list of `measured_value`s.
+      measured_values = []
+
+      for i in range(len(quantities)):
+        mv = measured_value(
+          quantities[i], uncertainties[i], sample_sizes[i], units
+        )
+
+        measured_values.append(mv)
+
+      # Combine the `measured_value`s.
+      combined_sample_size = combine_sample_size(
+        measured_values
+      )
+
+      combined_arithmetic_mean = combine_arithmetic_mean(
+        measured_values, combined_sample_size
+      )
+
+      combined_sample_standard_deviation = combine_sample_standard_deviation(
+        measured_values, combined_sample_size, combined_arithmetic_mean
+      )
+
+      # Round the quantity and uncertainty to the significant digit of
+      # uncertainty and insert the combined values into the results.
+      sigdig = find_significant_digit(combined_sample_standard_deviation)
+
+#      combined_arithmetic_mean = round_with_int_conversion(
+#        combined_arithmetic_mean, sigdig
+#      )
+
+#      combined_sample_standard_deviation = round_with_int_conversion(
+#        combined_sample_standard_deviation, sigdig
+#      )
+
+      combined_dependent_values[quantity]    = combined_arithmetic_mean
+      combined_dependent_values[uncertainty] = combined_sample_standard_deviation
+      combined_dependent_values[sample_size] = combined_sample_size
+
+    return combined_dependent_values
+
+  ############################################################################# 
+  # Output Stream.
+
+  def __iter__(self):
+    """Return an iterator to the output sequence of separated distinguishing
+    variables and dependent variables (a tuple of two `dict`s).
+
+    This is a requirement for the `Iterable` protocol.
+    """
+    return self
+
+  def records(self):
+    """Return an iterator to the output sequence of CSV rows (`dict`s of
+    variables to values).
+    """
+    return imap(unpack_tuple(lambda dist, dep: merge_dicts(dist, dep)), self)
+
+  def next(self):
+    """Produce the components of the next output record - a tuple of two
+    `dict`s. The first `dict` is a mapping of distinguishing variables to
+    distinguishing values, the second `dict` is a mapping of dependent
+    variables to combined dependent values. Combining the two dicts forms a
+    CSV row suitable for output.
+
+    This is a requirement for the `Iterator` protocol.
+
+    Raises:
+      StopIteration  : If there is no more output.
+      AssertionError : If class invariants were violated.
+    """
+    assert len(self.dataset.keys()) == len(self.in_order_dataset_keys),      \
+      "Number of dataset keys (`" + str(len(self.dataset.keys()))          + \
+      "`) is not equal to the number of keys in the ordering list (`"      + \
+      str(len(self.in_order_dataset_keys)) + "`)."
+
+    if len(self.in_order_dataset_keys) == 0:
+      raise StopIteration()
+
+    # Get the next set of distinguishing values and convert them to a `dict`.
+    raw_distinguishing_values = self.in_order_dataset_keys.popleft()
+    distinguishing_values     = dict(raw_distinguishing_values)
+
+    dependent_values = self.dataset.pop(raw_distinguishing_values)
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return (distinguishing_values, combined_dependent_values)
+
+  def __getitem__(self, distinguishing_values):
+    """Produce the dependent component, a `dict` mapping dependent variables to
+    combined dependent values, associated with `distinguishing_values`.
+
+    Args:
+      distinguishing_values (`dict`) :
+        A `dict` mapping distinguishing variables to distinguishing values.
+
+    Raises:
+      KeyError : If `distinguishing_values` is not in the dataset.
+    """
+    raw_distinguishing_values = self.key_from_dict(distinguishing_values)
+
+    dependent_values = self.dataset[raw_distinguishing_values]
+
+    combined_dependent_values = self.combine_dependent_values(dependent_values)
+
+    return combined_dependent_values
+
+###############################################################################
+
+args = process_program_arguments()
+
+if len(args.dependent_variables) == 0:
+  args.dependent_variables = [
+    "STL Average Walltime,STL Walltime Uncertainty,STL Trials",
+    "STL Average Throughput,STL Throughput Uncertainty,STL Trials",
+    "Thrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials",
+    "Thrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"
+  ]
+
+# Parse dependent variable options.
+dependent_variables = []
+
+parse_dependent_variable = dependent_variable_parser()
+
+#if args.dependent_variables is not None:
+for var in args.dependent_variables:
+  dependent_variables.append(parse_dependent_variable(var))
+
+# Read input files and open the output file.
+with io_manager(args.baseline_input_file, 
+                args.observed_input_file,
+                args.output_file,
+                args.preserve_whitespace) as iom:
+
+  # Create record aggregators.
+  baseline_ra = record_aggregator(dependent_variables, args.control_variables)
+  observed_ra = record_aggregator(dependent_variables, args.control_variables)
+
+  # Duplicate dependent variables: one for baseline results, one for observed
+  # results.
+  baseline_suffix = " - `{0}`".format(
+    args.baseline_input_file
+  )
+  observed_suffix = " - `{0}`".format(
+    args.observed_input_file
+  )
+
+  for var in dependent_variables:
+    # Remove the existing quantity variable:
+    #
+    #   [ ..., a, b, c, ... ]
+    #             ^- remove b at index i
+    #
+    (quantity_idx, quantity_units) = iom.remove_variable(var.quantity)
+
+    # If the `--output-all-variables` option was specified, add the new baseline
+    # and observed quantity variables. Note that we insert in the reverse of
+    # the order we desire (which is baseline then observed):
+    #
+    #   [ ..., a, b_1, c, ... ]
+    #              ^- insert b_1 at index i
+    #
+    #   [ ..., a, b_0, b_1, c, ... ]
+    #              ^- insert b_0 at index i
+    #
+    if args.output_all_variables:
+      iom.insert_variable(
+        quantity_idx, var.quantity + observed_suffix, quantity_units
+      )
+      iom.insert_variable(
+        quantity_idx, var.quantity + baseline_suffix, quantity_units
+      )
+
+    # Remove the existing uncertainty variable.
+    (uncertainty_idx, uncertainty_units) = iom.remove_variable(var.uncertainty)
+
+    # If the `--output-all-variables` option was specified, add the new baseline
+    # and observed uncertainty variables.
+    if args.output_all_variables:
+      iom.insert_variable(
+        uncertainty_idx, var.uncertainty + observed_suffix, uncertainty_units
+      )
+      iom.insert_variable(
+        uncertainty_idx, var.uncertainty + baseline_suffix, uncertainty_units
+      )
+
+    try:
+      # Remove the existing sample size variable.
+      (sample_size_idx, sample_size_units) = iom.remove_variable(var.sample_size)
+
+      # If the `--output-all-variables` option was specified, add the new
+      # baseline and observed sample size variables.
+      if args.output_all_variables:
+        iom.insert_variable(
+          sample_size_idx, var.sample_size + observed_suffix, sample_size_units
+        )
+        iom.insert_variable(
+          sample_size_idx, var.sample_size + baseline_suffix, sample_size_units
+        )
+    except ValueError:
+      # This is alright, because dependent variables may share the same sample
+      # size variable.
+      pass
+
+  for var in args.control_variables:
+    iom.remove_variable(var)
+
+  # Add change variables.
+  absolute_change_suffix = " - Change (`{0}` - `{1}`)".format(
+    args.observed_input_file, args.baseline_input_file
+  )
+
+  percent_change_suffix = " - % Change (`{0}` to `{1}`)".format(
+    args.observed_input_file, args.baseline_input_file
+  )
+
+  for var in dependent_variables:
+    iom.append_variable(var.quantity + absolute_change_suffix, var.units)
+    iom.append_variable(var.uncertainty + absolute_change_suffix, var.units)
+    iom.append_variable(var.quantity + percent_change_suffix, "")
+    iom.append_variable(var.uncertainty + percent_change_suffix, "")
+
+  # Add all baseline input data to the `record_aggregator`.
+  for record in iom.baseline():
+    baseline_ra.append(record)
+  
+  for record in iom.observed():
+    observed_ra.append(record)
+
+  iom.write_header()
+
+  # Compare and output results.
+  for distinguishing_values, observed_dependent_values in observed_ra:
+    try:
+      baseline_dependent_values = baseline_ra[distinguishing_values]
+    except KeyError: 
+      assert False,                                                           \
+        "Distinguishing value `"                                            + \
+        str(baseline_ra.key_from_dict(distinguishing_values))               + \
+        "` was not found in the baseline results."
+
+    statistically_significant_change = False
+
+    record = distinguishing_values.copy()
+
+    # Compute changes, add the values and changes to the record, and identify
+    # changes that are statistically significant.
+    for var in dependent_variables:
+      # Compute changes.
+      baseline_quantity    = baseline_dependent_values[var.quantity]
+      baseline_uncertainty = baseline_dependent_values[var.uncertainty]
+      baseline_sample_size = baseline_dependent_values[var.sample_size]
+
+      observed_quantity    = observed_dependent_values[var.quantity]
+      observed_uncertainty = observed_dependent_values[var.uncertainty]
+      observed_sample_size = observed_dependent_values[var.sample_size]
+
+      (abs_change, abs_change_unc, per_change, per_change_unc) = \
+        percent_change_uncertainty(
+          baseline_quantity, baseline_uncertainty,
+          observed_quantity, observed_uncertainty
+        )
+
+      # Round the change quantities and uncertainties to the significant digit
+      # of uncertainty.
+      try:
+        abs_change_sigdig = max(
+          find_significant_digit(abs_change),
+          find_significant_digit(abs_change_unc),
+        )
+
+#        abs_change     = round_with_int_conversion(
+#          abs_change,     abs_change_sigdig
+#        )
+#        abs_change_unc = round_with_int_conversion(
+#          abs_change_unc, abs_change_sigdig
+#        )
+      except:
+        # Any value errors should be due to NaNs returned by
+        # `percent_change_uncertainty` because quantities or change in
+        # quantities was 0. We can ignore these.
+        pass
+
+      try:
+        per_change_sigdig = max(
+          find_significant_digit(per_change),
+          find_significant_digit(per_change_unc)
+        )
+
+#        per_change     = round_with_int_conversion(
+#          per_change,     per_change_sigdig
+#        )
+#        per_change_unc = round_with_int_conversion(
+#          per_change_unc, per_change_sigdig
+#        )
+      except:
+        # Any value errors should be due to NaNs returned by
+        # `percent_change_uncertainty` because quantities or change in
+        # quantities was 0. We can ignore these.
+        pass
+
+      # Add the values (if the `--output-all-variables` option was specified)
+      # and the changes to the record. Note that the record's schema is
+      # different from the original schema. If multiple dependent variables
+      # share the same sample size variable, it's fine - they will overwrite
+      # each other, but with the same value.
+      if args.output_all_variables:
+        record[var.quantity + baseline_suffix]         = baseline_quantity
+        record[var.uncertainty + baseline_suffix]      = baseline_uncertainty
+        record[var.sample_size + baseline_suffix]      = baseline_sample_size
+        record[var.quantity + observed_suffix]         = observed_quantity
+        record[var.uncertainty + observed_suffix]      = observed_uncertainty
+        record[var.sample_size + observed_suffix]      = observed_sample_size
+
+      record[var.quantity + absolute_change_suffix]    = abs_change
+      record[var.uncertainty + absolute_change_suffix] = abs_change_unc
+      record[var.quantity + percent_change_suffix]     = per_change
+      record[var.uncertainty + percent_change_suffix]  = per_change_unc
+
+      # If the range of uncertainties overlap don't overlap and the percentage
+      # change is greater than the change threshold, then change is
+      # statistically significant.
+      overlap = ranges_overlap_uncertainty(
+          baseline_quantity, baseline_uncertainty,
+          observed_quantity, observed_uncertainty
+      )
+      if not overlap and per_change >= args.change_threshold:
+        statistically_significant_change = True
+
+    # Print the record if a statistically significant change was found or if the
+    # `--output-all-datapoints` option was specified.
+    if args.output_all_datapoints or statistically_significant_change:
+      iom.write(record)
+
diff --git a/internal/benchmark/random.h b/internal/benchmark/random.h
new file mode 100644
index 000000000..719588771
--- /dev/null
+++ b/internal/benchmark/random.h
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+struct hash32
+{
+  __host__ __device__
+  unsigned int operator()(unsigned int h) const
+  {
+    h = ~h + (h << 15);
+    h =  h ^ (h >> 12);
+    h =  h + (h <<  2);
+    h =  h ^ (h >>  4);
+    h =  h + (h <<  3) + (h << 11);
+    h =  h ^ (h >> 16);
+    return h;
+  }
+};
+
+struct hash64
+{
+  __host__ __device__
+  unsigned long long operator()(unsigned long long h) const
+  {
+    h = ~h + (h << 21);
+    h =  h ^ (h >> 24);
+    h = (h + (h <<  3)) + (h << 8);
+    h =  h ^ (h >> 14);
+    h = (h + (h <<  2)) + (h << 4);
+    h =  h ^ (h >> 28);
+    h =  h + (h << 31);
+    return h;
+  }
+};
+
+struct hashtofloat
+{
+  __host__ __device__
+  float operator()(unsigned int h) const
+  {
+    return static_cast<float>(hash32()(h)) / 4294967296.0f;
+  }
+};
+
+struct hashtodouble
+{
+  __host__ __device__
+  double operator()(unsigned long long h) const
+  {
+    return static_cast<double>(hash64()(h)) / 18446744073709551616.0;
+  }
+};
+
+
+
+template <typename Vector, typename T>
+void _randomize(Vector& v, T)
+{
+    thrust::transform(thrust::counting_iterator<unsigned int>(0), 
+                      thrust::counting_iterator<unsigned int>(0) + v.size(),
+                      v.begin(),
+                      hash32());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, long long)
+{
+    thrust::transform(thrust::counting_iterator<unsigned long long>(0), 
+                      thrust::counting_iterator<unsigned long long>(0) + v.size(),
+                      v.begin(),
+                      hash64());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, float)
+{
+    thrust::transform(thrust::counting_iterator<unsigned int>(0), 
+                      thrust::counting_iterator<unsigned int>(0) + v.size(),
+                      v.begin(),
+                      hashtofloat());
+}
+
+template <typename Vector>
+void _randomize(Vector& v, double)
+{
+    thrust::transform(thrust::counting_iterator<unsigned long long>(0), 
+                      thrust::counting_iterator<unsigned long long>(0) + v.size(),
+                      v.begin(),
+                      hashtodouble());
+}
+
+// fill Vector with random values
+template <typename Vector>
+void randomize(Vector& v)
+{
+    _randomize(v, typename Vector::value_type());
+}
+
+
diff --git a/internal/benchmark/tbb_algos.h b/internal/benchmark/tbb_algos.h
new file mode 100644
index 000000000..a50a1cd2f
--- /dev/null
+++ b/internal/benchmark/tbb_algos.h
@@ -0,0 +1,195 @@
+#pragma once
+
+#include <tbb/parallel_reduce.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_scan.h>
+#include <tbb/parallel_sort.h>
+#include <tbb/task_scheduler_init.h>
+#include <tbb/tick_count.h>
+#include <tbb/tbb_thread.h>
+
+#include <cstdef> // For std::size_t.
+
+#include <cassert>
+
+template <typename T>
+struct NegateBody
+{ 
+  void operator()(T& x) const
+  {
+    x = -x;
+  }
+};
+
+template <typename Vector>
+struct ForBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v;
+
+public: 
+  ForBody(Vector& x) : v(x) {}    
+
+  void operator()(tbb::blocked_range<std::size_t> const& r) const
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      v[i] = -v[i];
+  }
+};
+
+template <typename Vector>
+struct ReduceBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v;
+
+public: 
+  T sum;  
+
+  ReduceBody(Vector& x) : v(x), sum(0) {}    
+
+  ReduceBody(ReduceBody& x, tbb::split) : v(x.v), sum(0) {}
+
+  void operator()(tbb::blocked_range<std::size_t> const& r)
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      sum += v[i];
+  }
+  
+  void join(ReduceBody const& x) { sum += x.sum; } 
+};
+
+template <typename Vector>
+struct ScanBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector& v; 
+
+public: 
+  T sum; 
+
+  ScanBody(Vector& x) : sum(0), v(x) {} 
+
+  ScanBody(ScanBody& x, tbb::split) : v(x.v), sum(0) {} 
+
+  template <typename Tag> 
+  void operator()(tbb::blocked_range<std::size_t> const& r, Tag)
+  {
+    T temp = sum; 
+    for (std::size_t i = r.begin(); i < r.end(); ++i)
+    { 
+      temp = temp + x[i]; 
+      if (Tag::is_final_scan()) 
+        x[i] = temp; 
+    }        
+    sum = temp; 
+  }
+
+  void assign(ScanBody const& x) { sum = x.sum; } 
+
+  T get_sum() const { return sum; } 
+
+  void reverse_join(ScanBody const& x) { sum = x.sum + sum;} 
+};
+
+template <typename Vector>
+struct CopyBody
+{ 
+  typedef typename Vector::value_type T;
+
+private:
+  Vector &v;
+  Vector &u;
+
+public: 
+  CopyBody(Vector& x, Vector& y) : v(x), u(y) {}    
+
+  void operator()(tbb::blocked_range<size_t> const& r) const
+  { 
+    for (std::size_t i = r.begin(); i != r.end(); ++i)  
+      v[i] = u[i];
+  }
+};
+
+template <typename Vector>
+typename Vector::value_type tbb_reduce(Vector& v)
+{
+  ReduceBody<Vector> body(v);
+  tbb::parallel_reduce(tbb::blocked_range<size_t>(0, v.size()), body);
+  return body.sum;
+}
+
+template <typename Vector>
+void tbb_sort(Vector& v)
+{
+  tbb::parallel_sort(v.begin(), v.end());
+}
+
+template <typename Vector>
+void tbb_transform(Vector& v)
+{
+  ForBody<Vector> body(v);
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+template <typename Vector>
+void tbb_scan(Vector& v)
+{
+  ScanBody<Vector> body(v);
+  tbb::parallel_scan(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+template <typename Vector>
+void tbb_copy(Vector& v, Vector& u)
+{
+  CopyBody<Vector> body(v, u);
+  tbb::parallel_for(tbb::blocked_range<size_t>(0, v.size()), body);
+}
+
+void test_tbb()
+{
+  std::size_t elements = 1 << 20;
+
+  std::vector<int> A(elements);
+  std::vector<int> B(elements);
+  std::vector<int> C(elements);
+  std::vector<int> D(elements);
+
+  randomize(A);
+  randomize(B);
+  assert(std::accumulate(A.begin(), A.end(), 0) == tbb_reduce(A));
+  
+  randomize(A);
+  randomize(B);
+  std::transform(A.begin(), A.end(), A.begin(), thrust::negate<int>());
+  tbb_transform(B);
+  assert(A == B);
+ 
+  randomize(A);
+  randomize(B);
+  std::partial_sum(A.begin(), A.end(), A.begin());
+  tbb_scan(B);
+  assert(A == B);
+
+  randomize(A);
+  randomize(B);
+  std::sort(A.begin(), A.end());
+  tbb_sort(B);
+  assert(A == B);
+
+  randomize(A);
+  randomize(B);
+  randomize(C);
+  randomize(D);
+  std::copy(A.begin(), A.end(), C.begin());
+  tbb_copy(B, D);
+  assert(A == B);
+  assert(C == D);
+}
+
diff --git a/internal/benchmark/timer.h b/internal/benchmark/timer.h
new file mode 100644
index 000000000..077ffa44c
--- /dev/null
+++ b/internal/benchmark/timer.h
@@ -0,0 +1,129 @@
+#pragma once
+
+#include <cassert>
+
+#  define CUDA_SAFE_CALL_NO_SYNC( call) do {                                 \
+    cudaError err = call;                                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } } while (0)
+
+#  define CUDA_SAFE_CALL( call) do {                                         \
+    CUDA_SAFE_CALL_NO_SYNC(call);                                            \
+    cudaError err = cudaDeviceSynchronize();                                 \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "CUDA error in file '%s' in line %i : %s.\n",        \
+                __FILE__, __LINE__, cudaGetErrorString( err) );              \
+        exit(EXIT_FAILURE);                                                  \
+    } } while (0)
+
+class cuda_timer
+{
+    cudaEvent_t start_;
+    cudaEvent_t stop_;
+
+ public:
+    cuda_timer()
+    {
+        CUDA_SAFE_CALL(cudaEventCreate(&start_));
+        CUDA_SAFE_CALL(cudaEventCreate(&stop_));
+    }
+
+    ~cuda_timer()
+    {
+        CUDA_SAFE_CALL(cudaEventDestroy(start_));
+        CUDA_SAFE_CALL(cudaEventDestroy(stop_));
+    }
+
+    void start()
+    {
+        CUDA_SAFE_CALL(cudaEventRecord(start_, 0));
+    }
+
+    void stop()
+    {
+        CUDA_SAFE_CALL(cudaEventRecord(stop_, 0));
+        CUDA_SAFE_CALL(cudaEventSynchronize(stop_));
+    }
+
+    double milliseconds_elapsed()
+    {
+        float elapsed_time;
+        CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsed_time, start_, stop_));
+        return elapsed_time;
+    }
+
+    double seconds_elapsed()
+    {
+        return milliseconds_elapsed() / 1000.0;
+    }
+};
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+#include <windows.h>
+
+class steady_timer
+{
+    LARGE_INTEGER frequency_; // Cached to avoid system calls.
+    LARGE_INTEGER start_;
+    LARGE_INTEGER stop_;
+
+ public:
+    steady_timer() : start_(), stop_(), frequency_()
+    {
+        BOOL const r = QueryPerformanceFrequency(&frequency_);
+        assert(0 != r);
+    }
+
+    void start()
+    {
+        BOOL const r = QueryPerformanceCounter(&start_);
+        assert(0 != r);
+    }
+
+    void stop()
+    {
+        BOOL const r = QueryPerformanceCounter(&stop_);
+        assert(0 != r);
+    }
+
+    double seconds_elapsed()
+    {
+        return double(stop_.QuadPart - start_.QuadPart)
+             / double(frequency_.QuadPart);
+    }
+};
+#else
+#include <time.h>
+
+class steady_timer
+{
+    timespec start_;
+    timespec stop_;
+
+ public:
+    steady_timer() : start_(), stop_() {}
+
+    void start()
+    {
+        int const r = clock_gettime(CLOCK_MONOTONIC, &start_);
+        assert(0 == r);
+    }
+
+    void stop()
+    {
+        int const r = clock_gettime(CLOCK_MONOTONIC, &stop_);
+        assert(0 == r);
+    }
+
+    double seconds_elapsed()
+    {
+        return double(stop_.tv_sec  - start_.tv_sec)
+             + double(stop_.tv_nsec - start_.tv_nsec) * 1.0e-9;
+    }
+};
+#endif
+
+
diff --git a/internal/build/common_build.mk b/internal/build/common_build.mk
new file mode 100644
index 000000000..25cee6bb4
--- /dev/null
+++ b/internal/build/common_build.mk
@@ -0,0 +1,89 @@
+USE_NEW_PROJECT_MK := 1
+
+CCCL_ENABLE_DEPRECATIONS := 1
+
+ifeq ($(OS),Linux)
+  LIBRARIES += m
+endif
+
+include $(ROOTDIR)/thrust/internal/build/common_compiler.mk
+
+# Add /bigobj to Windows build flag to workaround building Thrust with debug
+ifeq ($(OS),win32)
+  CUDACC_FLAGS += -Xcompiler "/bigobj"
+endif
+
+# Add -mthumb for Linux on ARM to work around bug in arm cross compiler from p4
+ifeq ($(TARGET_ARCH),ARMv7)
+  ifneq ($(HOST_ARCH),ARMv7)
+    ifeq ($(THRUST_TEST),1)
+      CUDACC_FLAGS += -Xcompiler "-mthumb"
+    endif
+  endif
+endif
+
+# Make PGI statically link against its libraries.
+ifeq ($(OS),$(filter $(OS),Linux Darwin))
+  ifdef USEPGCXX
+    NVCC_LDFLAGS += -Xcompiler "-Bstatic_pgi"
+  endif
+endif
+ifeq ($(SRC_PATH),)
+  SRC_PATH:=$(dir $(BUILD_SRC))
+  BUILD_SRC:=$(notdir $(BUILD_SRC))
+endif
+
+BUILD_SRC_SUFFIX:=$(suffix $(BUILD_SRC))
+
+ifeq ($(BUILD_SRC_SUFFIX),.cu)
+  CU_FILES += $(BUILD_SRC)
+else ifeq ($(BUILD_SRC_SUFFIX),.cpp)
+  FILES += $(BUILD_SRC)
+endif
+
+ifndef BUILD_AGAINST_RELEASE
+  # CUDA includes
+  ifdef VULCAN
+    INCLUDES_ABSPATH += $(VULCAN_INSTALL_DIR)/cuda/include
+    INCLUDES_ABSPATH += $(VULCAN_INSTALL_DIR)/cuda/_internal/cudart
+  else
+    INCLUDES_ABSPATH += $(ROOTDIR)/cuda/inc
+    INCLUDES_ABSPATH += $(ROOTDIR)/cuda/tools/cudart
+  endif
+
+  # Thrust includes
+  ifdef VULCAN
+    INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/thrust
+  else
+    INCLUDES_ABSPATH += $(ROOTDIR)/thrust
+  endif
+
+  # CUB includes
+  ifdef VULCAN
+    INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/cub
+  else
+    INCLUDES_ABSPATH += $(ROOTDIR)/cub
+  endif
+else
+  # CUDA, CUB, and Thrust includes
+  INCLUDES_ABSPATH += $(GPGPU_COMPILER_EXPORT)/include
+
+  ifeq ($(TARGET_ARCH),ARMv7)
+    LIBDIRS_ABSPATH += $(GPGPU_COMPILER_EXPORT)/lib32
+  else
+    LIBDIRS_ABSPATH += $(GPGPU_COMPILER_EXPORT)/lib64
+  endif
+endif
+
+ifdef VULCAN
+  LIBDIRS_ABSPATH  += $(VULCAN_BUILD_DIR)/bin/$(VULCAN_ARCH)_$(VULCAN_OS)$(VULCAN_ABI)_$(VULCAN_BUILD)
+endif
+
+USES_CUDA_DRIVER_HEADERS := 1
+
+ifdef VULCAN_TOOLKIT_BASE
+  include $(VULCAN_TOOLKIT_BASE)/build/common.mk
+else
+  include $(ROOTDIR)/build/common.mk
+endif
+
diff --git a/internal/build/common_compiler.mk b/internal/build/common_compiler.mk
new file mode 100644
index 000000000..020159365
--- /dev/null
+++ b/internal/build/common_compiler.mk
@@ -0,0 +1,160 @@
+ifeq ($(OS),$(filter $(OS),Linux Darwin))
+  ifndef USEPGCXX
+    CUDACC_FLAGS += -Xcompiler "-Wall -Wextra -Werror"
+
+    ifdef USEXLC
+      CXX_STD := c++14
+
+      # GCC does not warn about unused parameters in uninstantiated
+      # template functions, but xlC does. This causes xlC to choke on the
+      # OMP backend, which is mostly #ifdef'd out when you aren't using it.
+      CUDACC_FLAGS += -Xcompiler "-Wno-unused-parameter"
+
+      # xlC is unreasonable about unused functions in a translation unit
+      # when this warning is enabled; this includes warning on most functions
+      # that are defined as static inline in cuda_fp16.h. Disable this warning
+      # entirely under xlC.
+      CUDACC_FLAGS += -Xcompiler "-Wno-unused-function"
+    else # GCC, ICC or Clang AKA the sane ones.
+      # XXX Enable -Wcast-align.
+      CUDACC_FLAGS += -Xcompiler "-Winit-self -Woverloaded-virtual -Wno-cast-align -Wcast-qual -Wno-long-long -Wno-variadic-macros -Wno-unused-function"
+
+      ifdef USE_CLANGLLVM
+        IS_CLANG := 1
+      endif
+
+      ifeq ($(ABITYPE), androideabi)
+        ifneq ($(findstring clang, $(BASE_COMPILER)),)
+          IS_CLANG := 1
+        endif
+      endif
+
+      ifeq ($(OS), Darwin)
+        IS_CLANG := 1
+      endif
+
+      ifdef IS_CLANG
+        CXX_STD := c++14
+
+        ifdef USE_CLANGLLVM
+          CLANG_VERSION = $(shell $(USE_CLANGLLVM) --version 2>/dev/null | head -1 | sed -e 's/.*\([0-9]\)\.\([0-9]\)\(\.[0-9]\).*/\1\2/g')
+        else
+          CLANG_VERSION = $(shell $(CCBIN) --version 2>/dev/null | head -1 | sed -e 's/.*\([0-9]\)\.\([0-9]\)\(\.[0-9]\).*/\1\2/g')
+        endif
+
+        # GCC does not warn about unused parameters in uninstantiated
+        # template functions, but Clang does. This causes Clang to choke on the
+        # OMP backend, which is mostly #ifdef'd out when you aren't using it.
+        CUDACC_FLAGS += -Xcompiler "-Wno-unused-parameter"
+
+        # -Wunneeded-internal-declaration misfires in the unit test framework
+        # on older versions of Clang.
+        CUDACC_FLAGS += -Xcompiler "-Wno-unneeded-internal-declaration"
+
+        ifeq ($(shell if test $(CLANG_VERSION) -ge 60; then echo true; fi),true)
+          # Clang complains about name mangling changes due to `noexcept`
+          # becoming part of the type system; we don't care.
+          CUDACC_FLAGS += -Xcompiler "-Wno-noexcept-type"
+        endif
+      else # GCC
+        ifdef CCBIN
+          CCBIN_ENVIRONMENT :=
+          ifeq ($(OS), QNX)
+            # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the
+            # environment.
+            CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET)
+          endif
+
+          # Newer versions of GCC only print the major number with the
+          # -dumpversion flag, but they print all three with -dumpfullversion.
+          GCC_VERSION = $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) -dumpfullversion 2>/dev/null | sed -e 's/\([0-9]\)\.\([0-9]\)\(\.[0-9]\)\?/\1\2/g')
+
+          ifeq ($(GCC_VERSION),)
+            # Older versions of GCC (~4.4 and older) seem to print three version
+            # numbers (major, minor and patch) with the -dumpversion flag; newer
+            # versions only print one or two numbers.
+            GCC_VERSION = $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) -dumpversion | sed -e 's/\([0-9]\)\.\([0-9]\)\(\.[0-9]\)\?/\1\2/g')
+          endif
+
+          ifeq ($(shell if test $(GCC_VERSION) -ge 50; then echo true; fi),true)
+            CXX_STD := c++14
+          else
+            CUDACC_FLAGS += -DTHRUST_IGNORE_DEPRECATED_CPP_DIALECT
+          endif
+
+          ifeq ($(shell if test $(GCC_VERSION) -ge 73; then echo true; fi),true)
+            # GCC 7.3 complains about name mangling changes due to `noexcept`
+            # becoming part of the type system; we don't care.
+            CUDACC_FLAGS += -Xcompiler "-Wno-noexcept-type"
+          endif
+          ifeq ($(shell if test $(GCC_VERSION) -ge 80; then echo true; fi),true)
+            # GCC 8.x has a new warning that tries to diagnose technical misuses of
+            # memcpy and memmove. We need to resolve it better than this, but for the
+            # time being, we'll downgrade it from an error to a warning.
+            CUDACC_FLAGS += -Xcompiler "-Wno-error=class-memaccess"
+          endif
+        else
+          $(error CCBIN is not defined.)
+        endif
+      endif
+    endif
+  else
+    CXX_STD := c++14
+  endif
+else ifeq ($(OS),win32)
+  CXX_STD := c++14
+
+  # XXX Enable /Wall
+  CUDACC_FLAGS += -Xcompiler "/WX"
+
+  # Disabled loss-of-data conversion warnings.
+  # XXX Re-enable.
+  CUDACC_FLAGS += -Xcompiler "/wd4244 /wd4267"
+
+  # Suppress numeric conversion-to-bool warnings.
+  # XXX Re-enable.
+  CUDACC_FLAGS += -Xcompiler "/wd4800"
+
+  # Disable warning about applying unary - to unsigned type.
+  CUDACC_FLAGS += -Xcompiler "/wd4146"
+
+  # Warning about declspec(allocator) on inappropriate function types
+  CUDACC_FLAGS += -Xcompiler "/wd4494"
+
+  # Allow tests to have lots and lots of sections in each translation unit:
+  CUDACC_FLAGS += -Xcompiler "/bigobj"
+endif
+
+# Promote all NVCC warnings into errors
+CUDACC_FLAGS += -Werror all-warnings
+
+# Print warning numbers with cudafe diagnostics
+CUDACC_FLAGS += -Xcudafe --display_error_number
+
+VERSION_FLAG :=
+ifeq ($(OS),$(filter $(OS),Linux Darwin))
+  ifdef USEPGCXX        # PGI
+    VERSION_FLAG := -V
+  else
+    ifdef USEXLC        # XLC
+      VERSION_FLAG := -qversion
+    else                # GCC, ICC or Clang AKA the sane ones.
+      VERSION_FLAG := --version
+    endif
+  endif
+else ifeq ($(OS),win32) # MSVC
+  # cl.exe run without any options will print its version info and exit.
+  VERSION_FLAG :=
+endif
+
+CCBIN_ENVIRONMENT :=
+ifeq ($(OS), QNX)
+  # QNX's GCC complains if QNX_HOST and QNX_TARGET aren't defined in the
+  # environment.
+  CCBIN_ENVIRONMENT := QNX_HOST=$(QNX_HOST) QNX_TARGET=$(QNX_TARGET)
+endif
+
+$(info #### CCBIN         : $(CCBIN))
+$(info #### CCBIN VERSION : $(shell $(CCBIN_ENVIRONMENT) $(CCBIN) $(VERSION_FLAG)))
+$(info #### CXX_STD       : $(CXX_STD))
+
diff --git a/internal/build/common_detect.mk b/internal/build/common_detect.mk
new file mode 100644
index 000000000..e4beb6b88
--- /dev/null
+++ b/internal/build/common_detect.mk
@@ -0,0 +1,15 @@
+CXX_STD = c++11
+
+ifeq ($(THRUST_TEST),1)
+  include $(ROOTDIR)/build/getprofile.mk
+  include $(ROOTDIR)/build/config/$(PROFILE).mk
+else
+  ifdef VULCAN_TOOLKIT_BASE
+    include $(VULCAN_TOOLKIT_BASE)/build/getprofile.mk
+    include $(VULCAN_TOOLKIT_BASE)/build/config/$(PROFILE).mk
+  else
+    include $(ROOTDIR)/build/getprofile.mk
+    include $(ROOTDIR)/build/config/$(PROFILE).mk
+  endif  # VULCAN_TOOLKIT_BASE
+endif  # THRUST_TEST
+
diff --git a/internal/build/generic_example.mk b/internal/build/generic_example.mk
new file mode 100644
index 000000000..8fe562245
--- /dev/null
+++ b/internal/build/generic_example.mk
@@ -0,0 +1,13 @@
+# Generic project mk that is included by examples mk
+EXECUTABLE := $(EXAMPLE_NAME)
+BUILD_SRC  := $(ROOTDIR)/thrust/$(EXAMPLE_SRC)
+
+include $(ROOTDIR)/thrust/internal/build/common_detect.mk
+
+EXAMPLE_MAKEFILE := $(join $(dir $(BUILD_SRC)), $(basename $(notdir $(BUILD_SRC))).mk)
+ifneq ("$(wildcard $(EXAMPLE_MAKEFILE))","") # Check if the file exists.
+  include $(EXAMPLE_MAKEFILE)
+endif
+
+include $(ROOTDIR)/thrust/internal/build/common_build.mk
+
diff --git a/internal/build/generic_test.mk b/internal/build/generic_test.mk
new file mode 100644
index 000000000..1be548c93
--- /dev/null
+++ b/internal/build/generic_test.mk
@@ -0,0 +1,23 @@
+# Generic project mk that is included by unit tests mk
+EXECUTABLE := $(TEST_NAME)
+BUILD_SRC  := $(ROOTDIR)/thrust/$(TEST_SRC)
+
+ifdef VULCAN
+  INCLUDES_ABSPATH += $(VULCAN_TOOLKIT_BASE)/thrust/testing
+else
+  INCLUDES_ABSPATH += $(ROOTDIR)/thrust/testing
+endif
+
+PROJ_LIBRARIES += testframework
+
+THRUST_TEST := 1
+
+include $(ROOTDIR)/thrust/internal/build/common_detect.mk
+
+TEST_MAKEFILE := $(join $(dir $(BUILD_SRC)), $(basename $(notdir $(BUILD_SRC))).mk)
+ifneq ("$(wildcard $(TEST_MAKEFILE))","") # Check if the file exists.
+  include $(TEST_MAKEFILE)
+endif
+
+include $(ROOTDIR)/thrust/internal/build/common_build.mk
+
diff --git a/internal/build/testframework.mk b/internal/build/testframework.mk
new file mode 100644
index 000000000..5c941f031
--- /dev/null
+++ b/internal/build/testframework.mk
@@ -0,0 +1,17 @@
+STATIC_LIBRARY := testframework
+
+SRC_PATH := $(ROOTDIR)/thrust/testing/
+BUILD_SRC := unittest/testframework.cu
+
+CUSRC := unittest/cuda/testframework.cu
+$(CUSRC).CUDACC_FLAGS    := -I$(ROOTDIR)/thrust/testing/cuda/
+$(CUSRC).TARGET_BASENAME := testframework_cu
+CU_FILES += $(CUSRC)
+
+INCLUDES_ABSPATH += $(ROOTDIR)/thrust/testing
+
+THRUST_TEST := 1
+
+include $(ROOTDIR)/thrust/internal/build/common_detect.mk
+include $(ROOTDIR)/thrust/internal/build/common_build.mk
+
diff --git a/internal/build/warningstester.mk b/internal/build/warningstester.mk
new file mode 100644
index 000000000..f2ceecd8e
--- /dev/null
+++ b/internal/build/warningstester.mk
@@ -0,0 +1,63 @@
+USE_NEW_PROJECT_MK := 1
+
+EXECUTABLE        := warningstester
+PROJ_DIR          := internal/build
+#GENCODE           :=
+
+ifndef PROFILE
+ifdef VULCAN_TOOLKIT_BASE
+include $(VULCAN_TOOLKIT_BASE)/build/getprofile.mk
+include $(VULCAN_TOOLKIT_BASE)/build/config/$(PROFILE).mk
+else
+include $(ROOTDIR)/build/getprofile.mk
+include $(ROOTDIR)/build/config/$(PROFILE).mk
+endif
+endif
+
+ARCH_NEG_FILTER += 20 21
+
+ifdef VULCAN_TOOLKIT_BASE
+include $(VULCAN_TOOLKIT_BASE)/build/config/DetectOS.mk
+else
+include $(ROOTDIR)/build/config/DetectOS.mk
+endif
+
+CU_FILES += ../test/warningstester.cu
+
+# Thrust includes
+ifdef VULCAN
+INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/include
+INCLUDES += $(VULCAN_INSTALL_DIR)/cuda/_internal/cudart
+INCLUDES += $(VULCAN_TOOLKIT_BASE)/cub
+else
+INCLUDES += ../..
+INCLUDES += ../../../cuda/tools/cudart
+INCLUDES += ../../../cub
+endif
+
+# Location of generated include file that includes all Thrust public headers
+GENERATED_SOURCES = $(BUILT_CWD)
+CUDACC_FLAGS += -I$(GENERATED_SOURCES)
+
+include $(ROOTDIR)/thrust/internal/build/common_compiler.mk
+
+ifdef VULCAN_TOOLKIT_BASE
+include $(VULCAN_TOOLKIT_BASE)/build/common.mk
+else
+include $(ROOTDIR)/build/common.mk
+endif
+
+warningstester$(OBJSUFFIX): $(GENERATED_SOURCES)/warningstester.h
+
+$(GENERATED_SOURCES)/warningstester.h: FORCE
+ifdef VULCAN
+ifeq ($(TARGET_ARCH), ppc64le)
+	$(PYTHON) $(SRC_CWD)/warningstester_create_uber_header.py $(VULCAN_INSTALL_DIR)/cuda/targets/ppc64le-linux/include > $@
+else
+	$(PYTHON) $(SRC_CWD)/warningstester_create_uber_header.py $(VULCAN_INSTALL_DIR)/cuda/include > $@
+endif
+else
+	$(PYTHON) $(SRC_CWD)/warningstester_create_uber_header.py $(SRC_CWD)/../.. > $@
+endif
+
+FORCE:
diff --git a/internal/build/warningstester_create_uber_header.py b/internal/build/warningstester_create_uber_header.py
new file mode 100644
index 000000000..cef19a43d
--- /dev/null
+++ b/internal/build/warningstester_create_uber_header.py
@@ -0,0 +1,54 @@
+'''
+Helper script for creating a header file that includes all of Thrust's
+public headers.  This is useful for instance, to quickly check that
+all the thrust headers obey proper syntax or are warning free.
+
+This script simply outputs a list of C-style #include's to the standard
+output--this should be redirected to a header file by the caller.
+'''
+
+import sys
+import os
+import re
+from stat import *
+
+thrustdir = sys.argv[1]
+
+def find_headers(base_dir, rel_dir, exclude = ['\B']):
+    '''
+    Recursively find all *.h files inside base_dir/rel_dir,
+    except any that match the exclude regexp list
+    '''
+    assert(type(exclude) == list)
+    full_dir = base_dir + '/' + rel_dir
+    result = []
+    for f in os.listdir(full_dir):
+        rel_file = rel_dir + '/' + f
+        for e in exclude:
+            if re.match(e, rel_file):
+                break
+        else:
+            if f.endswith('.h'):
+                result.append(rel_file)
+            elif S_ISDIR(os.stat(full_dir + '/' + f).st_mode):
+                result.extend(find_headers(base_dir, rel_file, exclude))
+    return result
+
+print('/* File is generated by ' + sys.argv[0] + ' */')
+
+exclude_re = ['.*/detail$',
+              'thrust/iterator',
+              'thrust/random',
+              'thrust/system/tbb']
+headers = find_headers(thrustdir, 'thrust', exclude_re)
+
+if len(headers) == 0:
+    print('#error no include files found\n')
+
+print('#define THRUST_CPP11_REQUIRED_NO_ERROR')
+print('#define THRUST_CPP14_REQUIRED_NO_ERROR')
+print('#define THRUST_MODERN_GCC_REQUIRED_NO_ERROR')
+for h in headers:
+    print('#include <' + h + '>')
+
+exit()
diff --git a/internal/racecheck.sh b/internal/racecheck.sh
new file mode 100755
index 000000000..0654ee98c
--- /dev/null
+++ b/internal/racecheck.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+MEMCHECK=/work/nightly/memcheck/bin/x86_64_Linux_release/cuda-memcheck 
+
+#########################
+
+files=`ls thrust.test.*`;
+files=`ls thrust.example.*`;
+
+#########################
+
+nfiles=0
+for fn in $files; do
+  nfiles=$((nfiles + 1))
+done
+j=1
+for fn in $files; do
+  echo " ----------------------------------------------------------------------"
+  echo "  *** MEMCHECK *** [$j/$nfiles] $fn"
+  echo " ----------------------------------------------------------------------"
+  $MEMCHECK --tool memcheck ./$fn --verbose
+  echo " ----------------------------------------------------------------------"
+  echo "  *** RACECHECK *** [$j/$nfiles] $fn"
+  echo " ----------------------------------------------------------------------"
+  $MEMCHECK --tool racecheck ./$fn --verbose --sizes=small
+  j=$((j+1))
+done;
diff --git a/internal/rename_cub_namespace.sh b/internal/rename_cub_namespace.sh
new file mode 100755
index 000000000..7a539e5d6
--- /dev/null
+++ b/internal/rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to add a THRUST_
+# prefix to CUB's namespace macro.
+
+sed -i -e 's/CUB_NS_P/THRUST_CUB_NS_P/g' `find . -type f`
+
diff --git a/internal/reverse_rename_cub_namespace.sh b/internal/reverse_rename_cub_namespace.sh
new file mode 100755
index 000000000..bc4858449
--- /dev/null
+++ b/internal/reverse_rename_cub_namespace.sh
@@ -0,0 +1,7 @@
+#! /bin/bash
+
+# Run this in //sw/gpgpu/thrust/thrust/system/cuda/detail/cub to undo the
+# renaming of CUB's namespace macro.
+
+sed -i -e 's|THRUST_CUB_NS_P|CUB_NS_P|g' `find . -type f`
+
diff --git a/internal/scripts/eris_perf.py b/internal/scripts/eris_perf.py
new file mode 100755
index 000000000..580471101
--- /dev/null
+++ b/internal/scripts/eris_perf.py
@@ -0,0 +1,189 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+from sys import exit 
+
+from os.path import join, dirname, basename, realpath
+
+from csv import DictReader as csv_dict_reader
+
+from subprocess import Popen
+
+from argparse import ArgumentParser as argument_parser
+
+###############################################################################
+
+def printable_cmd(c):
+  """Converts a `list` of `str`s representing a shell command to a printable 
+  `str`."""
+  return " ".join(map(lambda e: '"' + str(e) + '"', c))
+
+###############################################################################
+
+def print_file(p):
+  """Open the path `p` and print its contents to `stdout`."""
+  print "********************************************************************************"
+  with open(p) as f:
+    for line in f:
+      print line,
+  print "********************************************************************************"
+
+###############################################################################
+
+ap = argument_parser(
+  description = (
+    "CUDA Eris driver script: runs a benchmark suite multiple times, combines "
+    "the results, and outputs them in the CUDA Eris performance result format."
+  )
+)
+
+ap.add_argument(
+  "-b", "--benchmark", 
+  help = ("The location of the benchmark suite executable to run."),
+  type = str,
+  default = join(dirname(realpath(__file__)), "bench"), 
+  metavar = "R"
+)
+
+ap.add_argument(
+  "-p", "--postprocess", 
+  help = ("The location of the postprocessing script to run to combine the "
+          "results."),
+  type = str,
+  default = join(dirname(realpath(__file__)), "combine_benchmark_results.py"),
+  metavar = "R"
+)
+
+ap.add_argument(
+  "-r", "--runs", 
+  help = ("Run the benchmark suite `R` times.a),"),
+  type = int, default = 5, 
+  metavar = "R"
+)
+
+args = ap.parse_args()
+
+if args.runs <= 0:
+  print "ERROR: `--runs` must be greater than `0`."
+  ap.print_help()
+  exit(1)
+
+BENCHMARK_EXE             = args.benchmark
+BENCHMARK_NAME            = basename(BENCHMARK_EXE)
+POSTPROCESS_EXE           = args.postprocess
+OUTPUT_FILE_NAME          = lambda i: BENCHMARK_NAME + "_" + str(i) + ".csv"
+COMBINED_OUTPUT_FILE_NAME = BENCHMARK_NAME + "_combined.csv"
+
+###############################################################################
+
+print '&&&& RUNNING {0}'.format(BENCHMARK_NAME)
+
+print '#### RUNS {0}'.format(args.runs)
+
+###############################################################################
+
+print '#### CMD {0}'.format(BENCHMARK_EXE)
+
+for i in xrange(args.runs):
+  with open(OUTPUT_FILE_NAME(i), "w") as output_file:
+    print '#### RUN {0} OUTPUT -> {1}'.format(i, OUTPUT_FILE_NAME(i))
+
+    p = None
+
+    try:
+      p = Popen(BENCHMARK_EXE, stdout = output_file, stderr = output_file)
+      p.communicate()
+    except OSError as ex:
+      print_file(OUTPUT_FILE_NAME(i))
+      print '#### ERROR Caught OSError `{0}`.'.format(ex)
+      print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+      exit(-1)
+
+  print_file(OUTPUT_FILE_NAME(i))
+
+  if p.returncode != 0:
+    print '#### ERROR Process exited with code {0}.'.format(p.returncode)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(p.returncode)
+
+###############################################################################
+
+post_cmd = [POSTPROCESS_EXE]
+
+# Add dependent variable options.
+post_cmd += ["-dSTL Average Walltime,STL Walltime Uncertainty,STL Trials"]
+post_cmd += ["-dSTL Average Throughput,STL Throughput Uncertainty,STL Trials"]
+post_cmd += ["-dThrust Average Walltime,Thrust Walltime Uncertainty,Thrust Trials"]
+post_cmd += ["-dThrust Average Throughput,Thrust Throughput Uncertainty,Thrust Trials"]
+
+post_cmd += [OUTPUT_FILE_NAME(i) for i in range(args.runs)] 
+
+print '#### CMD {0}'.format(printable_cmd(post_cmd))
+
+with open(COMBINED_OUTPUT_FILE_NAME, "w") as output_file:
+  p = None
+
+  try:
+    p = Popen(post_cmd, stdout = output_file, stderr = output_file)
+    p.communicate()
+  except OSError as ex:
+    print_file(COMBINED_OUTPUT_FILE_NAME)
+    print '#### ERROR Caught OSError `{0}`.'.format(ex)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(-1)
+
+  print_file(COMBINED_OUTPUT_FILE_NAME)
+
+  if p.returncode != 0:
+    print '#### ERROR Process exited with code {0}.'.format(p.returncode)
+    print '&&&& FAILED {0}'.format(BENCHMARK_NAME)
+    exit(p.returncode)
+
+  with open(COMBINED_OUTPUT_FILE_NAME) as input_file:
+    reader = csv_dict_reader(input_file)
+
+    variable_units = reader.next() # Get units header row.
+
+    distinguishing_variables = reader.fieldnames
+
+    measured_variables = [
+      ("STL Average Throughput",    "+"),
+      ("Thrust Average Throughput", "+")
+    ]
+
+    for record in reader:
+      for variable, directionality in measured_variables:
+        # Don't monitor regressions for STL implementations, nvbug 28980890:
+        if "STL" in variable:
+          continue
+        print "&&&& PERF {0}_{1}_{2}bit_{3}mib_{4} {5} {6}{7}".format(
+          record["Algorithm"],
+          record["Element Type"],
+          record["Element Size"],
+          record["Total Input Size"],
+          variable.replace(" ", "_").lower(),
+          record[variable],
+          directionality,
+          variable_units[variable]
+        )
+
+###############################################################################
+                  
+print '&&&& PASSED {0}'.format(BENCHMARK_NAME)
+
diff --git a/internal/scripts/refresh_from_github2.sh b/internal/scripts/refresh_from_github2.sh
new file mode 100755
index 000000000..6b977bcf3
--- /dev/null
+++ b/internal/scripts/refresh_from_github2.sh
@@ -0,0 +1,96 @@
+branch="main"
+
+while getopts "hb:c:" opt; do
+    case $opt in
+        h)
+        echo "Usage: $0 [-h] [-b <github_branch_name>] -c <P4_changelist>"
+        exit 1
+        ;;
+
+        b)
+        branch=$OPTARG
+        ;;
+
+        c)
+        changelist=$OPTARG
+        ;;
+
+        /?)
+        echo "Invalid option: -$OPTARG" >&2;
+        exit 1
+        ;;
+
+        :)
+        echo "Option -$OPTARG requires an argument";
+        exit 1
+        ;;
+    esac
+done
+
+if [ "$changelist" == "" ]; then
+    echo "Missing required option -c to specify P4 changelist to put changed files into"
+    exit 1
+fi
+
+# Cause script to exit on any command that results in an error
+set -e
+
+echo "Downloading thrust code from the $branch branch into /tmp/thrust-${branch}"
+rm -rf /tmp/thrust-${branch}
+git clone -q git://github.com/NVIDIA/thrust.git -b ${branch} /tmp/thrust-${branch}
+
+cd `dirname $0`/../..
+echo "Changed current directory to `pwd`"
+
+vulcan_files=`echo *.vlcc *.vlct` 
+logdir=`mktemp -d /tmp/tmp.XXXXXXXX`
+echo "Logging p4 command outputs to temporary directory $logdir"
+for i in *; do
+    if [[ "$i" != "internal" && "$i" != "Makefile" ]]; then
+        ii="$i";
+        if [ -d $i ]; then ii="$i/..."; fi
+        echo "Reverting, force syncing, and then removing $ii"
+        p4 revert $ii >> $logdir/$i.revert.log 2>&1
+        p4 sync -f $ii >> $logdir/$i.sync.log 2>&1
+        rm -rf $i
+    fi
+done
+
+echo "Copying downloaded thrust code to p4 client"
+cp -R /tmp/thrust-${branch}/* .
+find . -name ".gitignore" | xargs -n 1 rm
+
+echo "Checking if version has been bumped"
+new_version=`grep "#define THRUST_VERSION" thrust/version.h | sed -e "s/#define THRUST_VERSION //"`
+old_version=`p4 print thrust/version.h | grep "#define THRUST_VERSION" | sed -e "s/#define THRUST_VERSION //"`
+if [ "$new_version" != "$old_version" ]; then
+    p4 edit internal/test/version.gold
+    new_version_print="$(( $new_version / 100000 )).$(( ($new_version / 100) % 1000 )).$(( $new_version % 100 ))"
+    sed -e "s/v[0-9\.][0-9\.]*/v${new_version_print}/" internal/test/version.gold > internal/test/version.gold.tmp
+    mv internal/test/version.gold.tmp internal/test/version.gold
+    echo "Updated version.gold to version $new_version_print"
+else
+    echo "Version has not changed"
+fi
+
+echo "Reconciling changed code into changelist $changelist"
+p4 reconcile -c $changelist ... >> $logdir/reconcile.log 2>&1
+p4 revert -c $changelist Makefile $vulcan_files internal/... >> $logdir/internal_files_revert.log 2>&1
+
+echo "Looking for examples that were added"
+for e in `find examples -name "*.cu"`; do
+    if [ ! -e internal/build/`basename $e .cu`.mk ]; then
+	echo "ADDED: `basename $e .cu`";
+    fi
+done
+
+echo "Looking for examples that were deleted or moved"
+for e in `find internal/build -name "*.mk"`; do
+    ee=`basename $e .mk`
+    case "$ee" in
+	generic_example | unittester* | warningstester) continue;;
+    esac
+    if [  "`find examples -name $ee.cu`" == "" ]; then
+	echo "DELETED: $ee";
+    fi;
+done
diff --git a/internal/scripts/tounix b/internal/scripts/tounix
new file mode 100755
index 000000000..c39a054a1
--- /dev/null
+++ b/internal/scripts/tounix
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+# converts all files in the current directory with extensions .h .inl or .cu to unix format
+
+#find . -type f \( -name "*.h" -o -name "*.inl" -o -name "*.cu" \) -a \( -not -wholename "*\.hg/*" \) -print
+find . -type f \( -name "*.h" -o -name "*.inl" -o -name "*.cu" \) -a \( -not -wholename "*\.hg/*" \) -exec fromdos -d {} \;
+
diff --git a/internal/scripts/wiki2tex.py b/internal/scripts/wiki2tex.py
new file mode 100644
index 000000000..67f658b2d
--- /dev/null
+++ b/internal/scripts/wiki2tex.py
@@ -0,0 +1,194 @@
+'''
+Convert Google Code .wiki files into .tex formatted files.
+
+Output is designed to be included within a larger TeX project, it is
+not standalone.
+
+'''
+
+import sys
+import re
+import codecs
+
+print(sys.argv)
+
+'''
+A "rule" is a begin tag, an end tag, and how to reformat the inner text
+(function)
+'''
+
+def encase(pre, post, strip=False):
+    """Return a function that prepends pre and postpends post"""
+    def f(txt):
+        if strip:
+            return pre + txt.strip() + post
+        else:
+            return pre + txt + post
+    return f
+
+def constant(text):
+    def f(txt):
+        return text
+    return f
+
+def encase_with_rules(pre, post, rules, strip=False):
+    def f(txt):
+        if strip:
+            return pre + apply_rules(txt, rules).strip() + post
+        else:
+            return pre + apply_rules(txt, rules) + post
+    return f
+
+def encase_escape_underscore(pre, post):
+    def f(txt):
+        txt = sub(r'_', r'\_', txt)
+        return pre + txt + post
+    return f
+
+def sub(pat, repl, txt):
+    """Substitute in repl for pat in txt, txt can be multiple lines"""
+    return re.compile(pat, re.MULTILINE).sub(repl, txt)
+
+def process_list(rules):
+    def f(txt):
+        txt = '  *' + txt # was removed to match begin tag of list
+        res = '\\begin{itemize}\n'
+        for ln in txt.split('\n'):
+            # Convert "  *" to "\item "
+            ln = sub(r'^  \*', r'\\item ', ln)
+            res += apply_rules(ln, rules) + '\n'
+        res += '\\end{itemize}\n'
+        return res
+    return f
+
+def process_link(rules):
+    def f(txt):
+        lst = txt.split(' ')
+        lnk = lst[0]
+        desc = apply_rules(' '.join(lst[1:]), rules)
+        if lnk[:7] == 'http://':
+            desc = apply_rules(' '.join(lst[1:]), rules)
+            return r'\href{' + lnk + r'}{' + desc + r'}'
+        if len(lst) > 1:
+            return r'\href{}{' + desc + r'}'
+        return r'\href{}{' + lnk + r'}'
+    return f
+
+# Some rules can be used inside some other rules (backticks in section names)
+
+link_rules = [
+    ['_', '', constant(r'\_')],
+]
+
+section_rules = [
+    ['`', '`', encase_escape_underscore(r'\texttt{', r'}')],
+]
+
+item_rules = [
+    ['`', '`', encase(r'\verb|', r'|')],
+    ['[', ']', process_link(link_rules)],
+]
+
+# Main rules for Latex formatting
+
+rules = [
+    ['{{{', '}}}', encase(r'\begin{lstlisting}[language=c++]', r'\end{lstlisting}')],
+    ['[', ']', process_link(link_rules)],
+    ['  *', '\n\n', process_list(item_rules)],
+    ['"', '"', encase("``", "''")],
+    ['`', '`', encase(r'\verb|', r'|')],
+    ['*', '*', encase(r'\emph{', r'}')],
+    ['_', '_', encase(r'\emph{', r'}')],
+    ['==', '==', encase_with_rules(r'\section{', r'}', section_rules, True)],
+    ['=', '=', encase_with_rules(r'\chapter{', r'}', section_rules, True)],
+    ['(e.g. f(x) -> y and f(x,y) -> ', 'z)', constant(r'(e.g. $f(x)\to y$ and $f(x,y)\to z$)')],
+]
+
+def match_rules(txt, rules):
+    """Find rule that first matches in txt"""
+    # Find first begin tag
+    first_begin_loc = 10e100
+    matching_rule = None
+    for rule in rules:
+        begin_tag, end_tag, func = rule
+        loc = txt.find(begin_tag)
+        if loc > -1 and loc < first_begin_loc:
+            first_begin_loc = loc
+            matching_rule = rule
+    return (matching_rule, first_begin_loc)
+
+def apply_rules(txt, rules):
+    """Apply set of rules to give txt, return transformed version of txt"""
+    matching_rule, first_begin_loc = match_rules(txt, rules)
+    if matching_rule is None:
+        return txt
+    begin_tag, end_tag, func = matching_rule
+    end_loc = txt.find(end_tag, first_begin_loc + 1)
+    if end_loc == -1:
+        sys.exit('Could not find end tag {0} after position {1}'.format(end_tag, first_begin_loc + 1))
+    inner_txt = txt[first_begin_loc + len(begin_tag) : end_loc]
+    # Copy characters up until begin tag
+    # Then have output of rule function on inner text
+    new_txt_start = txt[:first_begin_loc] + func(inner_txt)
+    # Follow with the remaining processed text
+    remaining_txt = txt[end_loc + len(end_tag):]
+    return new_txt_start + apply_rules(remaining_txt, rules)
+
+def split_sections(contents):
+    """Given one string of all file contents, return list of sections
+    
+    Return format is list of pairs, each pair has section title
+    and list of lines.  Result is ordered as the original input.
+
+    """
+    res = []
+    cur_section = ''
+    section = []
+    for ln in contents.split('\n'):
+        if len(ln) > 0 and ln[0] == '=':
+            # remove = formatting from line
+            section_title = sub(r'^\=+ (.*) \=+', r'\1', ln)
+            res.append((cur_section, section))
+            cur_section = section_title
+            section = [ln]
+        else:
+            section.append(ln)
+    res.append((cur_section, section))
+    return res
+
+def filter_sections(splitinput, removelst):
+    """Take split input and remove sections in removelst"""
+    res = []
+    for sectname, sectcontents in splitinput:
+        if sectname in removelst:
+            pass
+        else:
+            res.extend(sectcontents)
+    # convert to single string for output
+    return '\n'.join(res)
+
+
+def main():
+    infile = codecs.open(sys.argv[1], encoding='utf-8')
+    outfile = codecs.open(sys.argv[2], mode='w', encoding='utf-8')
+    
+    contents = infile.read()
+    
+    # Remove first three lines
+    contents = '\n'.join(contents.split('\n')[3:])
+    
+    # Split sections and filter out some of them
+    sections = split_sections(contents)
+    contents = filter_sections(sections, ['Introduction', 'Prerequisites', 'Simple Example'])
+    
+    # Convert to latex format
+    contents = apply_rules(contents, rules)
+    
+    infile.close()
+    outfile.write(contents)
+    outfile.close()
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/internal/test/dvstest.lst b/internal/test/dvstest.lst
new file mode 100755
index 000000000..ffe580f08
--- /dev/null
+++ b/internal/test/dvstest.lst
@@ -0,0 +1,425 @@
+TestAdjacentDifference
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceDispatchExplicit
+TestAdjacentDifferenceDispatchImplicit
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAllOfDevice
+TestAllOfDispatchExplicit
+TestAllOfDispatchImplicit
+TestAllOfHost
+TestAnyOfDevice
+TestAnyOfDispatchExplicit
+TestAnyOfDispatchImplicit
+TestAnyOfHost
+TestComputeCapability
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDispatchExplicit
+TestCopyDispatchImplicit
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfDispatchExplicit
+TestCopyIfDispatchImplicit
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilDispatchExplicit
+TestCopyIfStencilDispatchImplicit
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountDispatchExplicit
+TestCountDispatchImplicit
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestFill
+TestFillDiscardIterator
+TestFillDispatchExplicit
+TestFillDispatchImplicit
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDiscardIterator
+TestFillNDispatchExplicit
+TestFillNDispatchImplicit
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestForEach
+TestForEachDispatchExplicit
+TestForEachDispatchImplicit
+TestForEachN
+TestForEachNDispatchExplicit
+TestForEachNDispatchImplicit
+TestForEachNSimpleAnySystem
+TestForEachNSimpleDevice
+TestForEachNSimpleHost
+TestForEachNWithLargeTypes
+TestForEachSimpleAnySystem
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherDispatchExplicit
+TestGatherDispatchImplicit
+TestGatherIf
+TestGatherIfDispatchExplicit
+TestGatherIfDispatchImplicit
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateDispatchExplicit
+TestGenerateDispatchImplicit
+TestGenerateNDispatchExplicit
+TestGenerateNDispatchImplicit
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestInnerProduct
+TestInnerProductDispatchExplicit
+TestInnerProductDispatchImplicit
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPlainOldData
+TestIsTrivialIterator
+TestMaxActiveBlocks
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementDispatchExplicit
+TestMaxElementDispatchImplicit
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMerge
+TestMergeDescending
+TestMergeDispatchExplicit
+TestMergeDispatchImplicit
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeToDiscardIterator
+TestMinElement
+TestMinElementDispatchExplicit
+TestMinElementDispatchImplicit
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementDispatchExplicit
+TestMinMaxElementDispatchImplicit
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestNoneOfDevice
+TestNoneOfDispatchExplicit
+TestNoneOfDispatchImplicit
+TestNoneOfHost
+TestPartition
+TestPartitionCopy
+TestPartitionCopyDispatchExplicit
+TestPartitionCopyDispatchImplicit
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyStencil
+TestPartitionCopyStencilDispatchExplicit
+TestPartitionCopyStencilDispatchImplicit
+TestPartitionCopyStencilSimpleDevice
+TestPartitionCopyStencilSimpleHost
+TestPartitionCopyStencilToDiscardIterator
+TestPartitionCopyToDiscardIterator
+TestPartitionDispatchExplicit
+TestPartitionDispatchImplicit
+TestPartitionPointDevice
+TestPartitionPointDispatchExplicit
+TestPartitionPointDispatchImplicit
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionStencil
+TestPartitionStencilDispatchExplicit
+TestPartitionStencilDispatchImplicit
+TestPartitionStencilSimpleDevice
+TestPartitionStencilSimpleHost
+TestPartitionStencilZipIteratorDevice
+TestPartitionStencilZipIteratorHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestReduce
+TestReduceByKey
+TestReduceByKeyDispatchExplicit
+TestReduceByKeyDispatchImplicit
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceCountingIterator
+TestReduceDispatchExplicit
+TestReduceDispatchImplicit
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyDispatchExplicit
+TestRemoveCopyDispatchImplicit
+TestRemoveCopyIf
+TestRemoveCopyIfDispatchExplicit
+TestRemoveCopyIfDispatchImplicit
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilDispatchExplicit
+TestRemoveCopyIfStencilDispatchImplicit
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveDispatchExplicit
+TestRemoveDispatchImplicit
+TestRemoveIf
+TestRemoveIfDispatchExplicit
+TestRemoveIfDispatchImplicit
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilDispatchExplicit
+TestRemoveIfStencilDispatchImplicit
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyDispatchExplicit
+TestReplaceCopyDispatchImplicit
+TestReplaceCopyIf
+TestReplaceCopyIfDispatchExplicit
+TestReplaceCopyIfDispatchImplicit
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilDispatchExplicit
+TestReplaceCopyIfStencilDispatchImplicit
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceDispatchExplicit
+TestReplaceDispatchImplicit
+TestReplaceIf
+TestReplaceIfDispatchExplicit
+TestReplaceIfDispatchImplicit
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilDispatchExplicit
+TestReplaceIfStencilDispatchImplicit
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopyDispatchExplicit
+TestReverseCopyDispatchImplicit
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseDispatchExplicit
+TestReverseDispatchImplicit
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestSetIntersection
+TestSetIntersectionDispatchExplicit
+TestSetIntersectionDispatchImplicit
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceDispatchExplicit
+TestSetSymmetricDifferenceDispatchImplicit
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionDispatchExplicit
+TestSetUnionDispatchImplicit
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopyDispatchExplicit
+TestStablePartitionCopyDispatchImplicit
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyStencil
+TestStablePartitionCopyStencilDispatchExplicit
+TestStablePartitionCopyStencilDispatchImplicit
+TestStablePartitionCopyStencilSimpleDevice
+TestStablePartitionCopyStencilSimpleHost
+TestStablePartitionCopyStencilToDiscardIterator
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionDispatchExplicit
+TestStablePartitionDispatchImplicit
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionStencil
+TestStablePartitionStencilDispatchExplicit
+TestStablePartitionStencilDispatchImplicit
+TestStablePartitionStencilSimpleDevice
+TestStablePartitionStencilSimpleHost
+TestStablePartitionStencilZipIteratorDevice
+TestStablePartitionStencilZipIteratorHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestTransformBinary
+TestTransformBinaryCountingIterator
+TestTransformBinaryDispatchExplicit
+TestTransformBinaryDispatchImplicit
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformExclusiveScanDispatchExplicit
+TestTransformExclusiveScanDispatchImplicit
+TestTransformIfBinary
+TestTransformIfBinaryDispatchExplicit
+TestTransformIfBinaryDispatchImplicit
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryDispatchExplicit
+TestTransformIfUnaryDispatchImplicit
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilDispatchExplicit
+TestTransformIfUnaryNoStencilDispatchImplicit
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformInclusiveScanDispatchExplicit
+TestTransformInclusiveScanDispatchImplicit
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIterator
+TestTransformUnaryDispatchExplicit
+TestTransformUnaryDispatchImplicit
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeyCopyDispatchExplicit
+TestUniqueByKeyCopyDispatchImplicit
+TestUniqueByKeyDispatchExplicit
+TestUniqueByKeyDispatchImplicit
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopyDispatchExplicit
+TestUniqueCopyDispatchImplicit
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueDispatchExplicit
+TestUniqueDispatchImplicit
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorBinarySearch
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchDispatchExplicit
+TestVectorBinarySearchDispatchImplicit
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorLowerBound
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundDispatchExplicit
+TestVectorLowerBoundDispatchImplicit
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorUpperBound
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundDispatchExplicit
+TestVectorUpperBoundDispatchImplicit
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
diff --git a/internal/test/thrust.example.arbitrary_transformation.filecheck b/internal/test/thrust.example.arbitrary_transformation.filecheck
new file mode 100644
index 000000000..81b25ae23
--- /dev/null
+++ b/internal/test/thrust.example.arbitrary_transformation.filecheck
@@ -0,0 +1,5 @@
+     CHECK: 3 + 6 * 2 = 15
+CHECK-NEXT: 4 + 7 * 5 = 39
+CHECK-NEXT: 0 + 2 * 7 = 14
+CHECK-NEXT: 8 + 1 * 4 = 12
+CHECK-NEXT: 2 + 8 * 3 = 26
diff --git a/internal/test/thrust.example.basic_vector.filecheck b/internal/test/thrust.example.basic_vector.filecheck
new file mode 100644
index 000000000..ab17b8251
--- /dev/null
+++ b/internal/test/thrust.example.basic_vector.filecheck
@@ -0,0 +1,8 @@
+     CHECK: H has size 4
+CHECK-NEXT: H[0] = 14
+CHECK-NEXT: H[1] = 20
+CHECK-NEXT: H[2] = 38
+CHECK-NEXT: H[3] = 46
+CHECK-NEXT: H now has size 2
+CHECK-NEXT: D[0] = 99
+CHECK-NEXT: D[1] = 88
diff --git a/internal/test/thrust.example.bounding_box.filecheck b/internal/test/thrust.example.bounding_box.filecheck
new file mode 100644
index 000000000..ddbe4a201
--- /dev/null
+++ b/internal/test/thrust.example.bounding_box.filecheck
@@ -0,0 +1 @@
+     CHECK: bounding box (0.000022,0.037300) (0.967956,0.995085)
diff --git a/internal/test/thrust.example.bucket_sort2d.filecheck b/internal/test/thrust.example.bucket_sort2d.filecheck
new file mode 100644
index 000000000..688e49cba
--- /dev/null
+++ b/internal/test/thrust.example.bucket_sort2d.filecheck
@@ -0,0 +1,55 @@
+     CHECK: bucket (150, 50)'s list of points:
+CHECK-NEXT: (0.751041,0.505377)
+CHECK-NEXT: (0.750647,0.505272)
+CHECK-NEXT: (0.752243,0.509601)
+CHECK-NEXT: (0.750937,0.503519)
+CHECK-NEXT: (0.753879,0.506217)
+CHECK-NEXT: (0.754956,0.501953)
+CHECK-NEXT: (0.754439,0.502353)
+CHECK-NEXT: (0.754128,0.501410)
+CHECK-NEXT: (0.750917,0.502195)
+CHECK-NEXT: (0.754024,0.507150)
+CHECK-NEXT: (0.750565,0.502896)
+CHECK-NEXT: (0.753444,0.509374)
+CHECK-NEXT: (0.754874,0.506500)
+CHECK-NEXT: (0.754646,0.508721)
+CHECK-NEXT: (0.753527,0.504378)
+CHECK-NEXT: (0.754563,0.502366)
+CHECK-NEXT: (0.751227,0.502014)
+CHECK-NEXT: (0.753009,0.508329)
+CHECK-NEXT: (0.752284,0.500607)
+CHECK-NEXT: (0.753341,0.503853)
+CHECK-NEXT: (0.751787,0.501364)
+CHECK-NEXT: (0.750171,0.500588)
+CHECK-NEXT: (0.752243,0.501621)
+CHECK-NEXT: (0.752056,0.509570)
+CHECK-NEXT: (0.752263,0.507172)
+CHECK-NEXT: (0.754024,0.501935)
+CHECK-NEXT: (0.751538,0.500686)
+CHECK-NEXT: (0.754024,0.508004)
+CHECK-NEXT: (0.750358,0.506688)
+CHECK-NEXT: (0.751083,0.505733)
+CHECK-NEXT: (0.750150,0.505805)
+CHECK-NEXT: (0.750585,0.505232)
+CHECK-NEXT: (0.753838,0.508040)
+CHECK-NEXT: (0.750461,0.501308)
+CHECK-NEXT: (0.753527,0.501546)
+CHECK-NEXT: (0.751145,0.508224)
+CHECK-NEXT: (0.751953,0.506566)
+CHECK-NEXT: (0.750378,0.502955)
+CHECK-NEXT: (0.751704,0.507102)
+CHECK-NEXT: (0.754646,0.502674)
+CHECK-NEXT: (0.750772,0.501464)
+CHECK-NEXT: (0.752325,0.502761)
+CHECK-NEXT: (0.752408,0.502305)
+CHECK-NEXT: (0.751000,0.508639)
+CHECK-NEXT: (0.754252,0.506525)
+CHECK-NEXT: (0.753175,0.504877)
+CHECK-NEXT: (0.753071,0.502682)
+CHECK-NEXT: (0.750109,0.503627)
+CHECK-NEXT: (0.754936,0.506406)
+CHECK-NEXT: (0.754521,0.500953)
+CHECK-NEXT: (0.753941,0.509584)
+CHECK-NEXT: (0.754915,0.504699)
+CHECK-NEXT: (0.751476,0.509525)
+CHECK-NEXT: (0.752823,0.507129)
diff --git a/internal/test/thrust.example.constant_iterator.filecheck b/internal/test/thrust.example.constant_iterator.filecheck
new file mode 100644
index 000000000..53733577b
--- /dev/null
+++ b/internal/test/thrust.example.constant_iterator.filecheck
@@ -0,0 +1,4 @@
+     CHECK: 13
+CHECK-NEXT: 17
+CHECK-NEXT: 12
+CHECK-NEXT: 15
diff --git a/internal/test/thrust.example.counting_iterator.filecheck b/internal/test/thrust.example.counting_iterator.filecheck
new file mode 100644
index 000000000..b84601bbc
--- /dev/null
+++ b/internal/test/thrust.example.counting_iterator.filecheck
@@ -0,0 +1,5 @@
+     CHECK: found 4 nonzero values at indices:
+CHECK-NEXT: 1
+CHECK-NEXT: 2
+CHECK-NEXT: 5
+CHECK-NEXT: 7
diff --git a/internal/test/thrust.example.cuda.async_reduce.filecheck b/internal/test/thrust.example.cuda.async_reduce.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck b/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck
new file mode 100644
index 000000000..a1af14e69
--- /dev/null
+++ b/internal/test/thrust.example.cuda.custom_temporary_allocation.filecheck
@@ -0,0 +1,16 @@
+     CHECK: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): allocating new block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::allocate(): num_bytes == {{[0-9]+}}
+CHECK-NEXT: cached_allocator::allocate(): found a free block
+CHECK-NEXT: cached_allocator::deallocate(): ptr == {{(0x)?}}{{[0-9a-z]+}}
+CHECK-NEXT: cached_allocator::free_all()
diff --git a/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck
new file mode 100644
index 000000000..8b81c77d3
--- /dev/null
+++ b/internal/test/thrust.example.cuda.explicit_cuda_stream.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 499500
diff --git a/internal/test/thrust.example.cuda.fallback_allocator.filecheck b/internal/test/thrust.example.cuda.fallback_allocator.filecheck
new file mode 100644
index 000000000..535fc87fa
--- /dev/null
+++ b/internal/test/thrust.example.cuda.fallback_allocator.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Testing fallback_allocator on device
+CHECK-SAME: with {{[0-9]+}} bytes of device memory
+     CHECK: attempting to sort {{[0-9]+}} values
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK: attempting to sort {{[0-9]+}} values
+     CHECK:   allocated {{[0-9]+}} bytes of device memory
+     CHECK:   allocated {{[0-9]+}} bytes of pinned host memory (fallback successful)
diff --git a/internal/test/thrust.example.cuda.global_device_vector.filecheck b/internal/test/thrust.example.cuda.global_device_vector.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.cuda.range_view.filecheck b/internal/test/thrust.example.cuda.range_view.filecheck
new file mode 100644
index 000000000..83e3127d7
--- /dev/null
+++ b/internal/test/thrust.example.cuda.range_view.filecheck
@@ -0,0 +1,4 @@
+     CHECK: z[0]= 7
+CHECK-NEXT: z[1]= 8
+CHECK-NEXT: z[2]= 9
+CHECK-NEXT: z[3]= 10
diff --git a/internal/test/thrust.example.cuda.unwrap_pointer.filecheck b/internal/test/thrust.example.cuda.unwrap_pointer.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.cuda.wrap_pointer.filecheck b/internal/test/thrust.example.cuda.wrap_pointer.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.device_ptr.filecheck b/internal/test/thrust.example.device_ptr.filecheck
new file mode 100644
index 000000000..b02b51588
--- /dev/null
+++ b/internal/test/thrust.example.device_ptr.filecheck
@@ -0,0 +1,2 @@
+     CHECK: device array contains 10 values
+CHECK-NEXT: sum of values is 45
diff --git a/internal/test/thrust.example.discrete_voronoi.filecheck b/internal/test/thrust.example.discrete_voronoi.filecheck
new file mode 100644
index 000000000..3dbf65cf5
--- /dev/null
+++ b/internal/test/thrust.example.discrete_voronoi.filecheck
@@ -0,0 +1,11 @@
+     CHECK: [Inititialize {{[0-9]+}}x{{[0-9]+}} Image]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [Copy to Device]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [JFA stepping]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT:   ( {{[0-9.]+}} MPixel/s ) 
+CHECK-NEXT: [Device to Host Copy]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
+CHECK-NEXT: [PGM Export]
+CHECK-NEXT:   ( {{[0-9.]+}}ms )
diff --git a/internal/test/thrust.example.dot_products_with_zip.filecheck b/internal/test/thrust.example.dot_products_with_zip.filecheck
new file mode 100644
index 000000000..a8a1b3e3e
--- /dev/null
+++ b/internal/test/thrust.example.dot_products_with_zip.filecheck
@@ -0,0 +1,4 @@
+     CHECK: (0.000022,0.000022,0.000022) * (0.000022,0.000022,0.000022) = 0.000000
+CHECK-NEXT: (0.085032,0.085032,0.085032) * (0.085032,0.085032,0.085032) = 0.021692
+CHECK-NEXT: (0.601353,0.601353,0.601353) * (0.601353,0.601353,0.601353) = 1.084875
+CHECK-NEXT: (0.891611,0.891611,0.891611) * (0.891611,0.891611,0.891611) = 2.384912
diff --git a/internal/test/thrust.example.expand.filecheck b/internal/test/thrust.example.expand.filecheck
new file mode 100644
index 000000000..a43241087
--- /dev/null
+++ b/internal/test/thrust.example.expand.filecheck
@@ -0,0 +1,4 @@
+     CHECK: Expanding values according to counts
+CHECK-NEXT:  counts 3 5 2 0 1 3 4 2 4 
+CHECK-NEXT:  values 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT:  output 1 1 1 2 2 2 2 2 3 3 5 6 6 6 7 7 7 7 8 8 9 9 9 9 
diff --git a/internal/test/thrust.example.fill_copy_sequence.filecheck b/internal/test/thrust.example.fill_copy_sequence.filecheck
new file mode 100644
index 000000000..78f3acda2
--- /dev/null
+++ b/internal/test/thrust.example.fill_copy_sequence.filecheck
@@ -0,0 +1,10 @@
+     CHECK: D[0] = 0
+CHECK-NEXT: D[1] = 1
+CHECK-NEXT: D[2] = 2
+CHECK-NEXT: D[3] = 3
+CHECK-NEXT: D[4] = 4
+CHECK-NEXT: D[5] = 9
+CHECK-NEXT: D[6] = 9
+CHECK-NEXT: D[7] = 1
+CHECK-NEXT: D[8] = 1
+CHECK-NEXT: D[9] = 1
diff --git a/internal/test/thrust.example.histogram.filecheck b/internal/test/thrust.example.histogram.filecheck
new file mode 100644
index 000000000..bb5dbdba1
--- /dev/null
+++ b/internal/test/thrust.example.histogram.filecheck
@@ -0,0 +1,10 @@
+     CHECK: Dense Histogram
+CHECK-NEXT:           initial data  3 4 3 5 8 5 6 6 4 4 5 3 2 5 6 3 1 3 2 3 6 5 3 3 3 2 4 2 3 3 2 5 5 5 8 2 5 6 6 3 
+CHECK-NEXT:            sorted data  1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 8 8 
+CHECK-NEXT:   cumulative histogram  0 1 7 19 23 32 38 38 40 
+CHECK-NEXT:              histogram  0 1 6 12 4 9 6 0 2 
+CHECK-NEXT: Sparse Histogram
+CHECK-NEXT:           initial data  3 4 3 5 8 5 6 6 4 4 5 3 2 5 6 3 1 3 2 3 6 5 3 3 3 2 4 2 3 3 2 5 5 5 8 2 5 6 6 3 
+CHECK-NEXT:            sorted data  1 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 4 4 4 4 5 5 5 5 5 5 5 5 5 6 6 6 6 6 6 8 8 
+CHECK-NEXT:       histogram values  1 2 3 4 5 6 8 
+CHECK-NEXT:       histogram counts  1 6 12 4 9 6 2 
diff --git a/internal/test/thrust.example.lambda.filecheck b/internal/test/thrust.example.lambda.filecheck
new file mode 100644
index 000000000..2937024bb
--- /dev/null
+++ b/internal/test/thrust.example.lambda.filecheck
@@ -0,0 +1,10 @@
+     CHECK: SAXPY (functor method)
+CHECK-NEXT: 2 * 1 + 1 = 3
+CHECK-NEXT: 2 * 2 + 1 = 5
+CHECK-NEXT: 2 * 3 + 1 = 7
+CHECK-NEXT: 2 * 4 + 1 = 9
+CHECK-NEXT: SAXPY (placeholder method)
+CHECK-NEXT: 2 * 1 + 1 = 3
+CHECK-NEXT: 2 * 2 + 1 = 5
+CHECK-NEXT: 2 * 3 + 1 = 7
+CHECK-NEXT: 2 * 4 + 1 = 9
diff --git a/internal/test/thrust.example.lexicographical_sort.filecheck b/internal/test/thrust.example.lexicographical_sort.filecheck
new file mode 100644
index 000000000..7d2dc4907
--- /dev/null
+++ b/internal/test/thrust.example.lexicographical_sort.filecheck
@@ -0,0 +1,42 @@
+     CHECK: Unsorted Keys
+CHECK-NEXT: (0,2,6)
+CHECK-NEXT: (0,4,4)
+CHECK-NEXT: (6,8,5)
+CHECK-NEXT: (8,6,8)
+CHECK-NEXT: (9,9,4)
+CHECK-NEXT: (1,9,7)
+CHECK-NEXT: (5,1,0)
+CHECK-NEXT: (3,8,1)
+CHECK-NEXT: (2,9,2)
+CHECK-NEXT: (7,2,7)
+CHECK-NEXT: (0,9,0)
+CHECK-NEXT: (5,4,1)
+CHECK-NEXT: (5,3,6)
+CHECK-NEXT: (8,5,5)
+CHECK-NEXT: (5,3,7)
+CHECK-NEXT: (5,7,3)
+CHECK-NEXT: (8,6,4)
+CHECK-NEXT: (9,5,4)
+CHECK-NEXT: (7,5,9)
+CHECK-NEXT: (9,0,9)
+CHECK-NEXT: Sorted Keys
+CHECK-NEXT: (0,2,6)
+CHECK-NEXT: (0,4,4)
+CHECK-NEXT: (0,9,0)
+CHECK-NEXT: (1,9,7)
+CHECK-NEXT: (2,9,2)
+CHECK-NEXT: (3,8,1)
+CHECK-NEXT: (5,1,0)
+CHECK-NEXT: (5,3,6)
+CHECK-NEXT: (5,3,7)
+CHECK-NEXT: (5,4,1)
+CHECK-NEXT: (5,7,3)
+CHECK-NEXT: (6,8,5)
+CHECK-NEXT: (7,2,7)
+CHECK-NEXT: (7,5,9)
+CHECK-NEXT: (8,5,5)
+CHECK-NEXT: (8,6,4)
+CHECK-NEXT: (8,6,8)
+CHECK-NEXT: (9,0,9)
+CHECK-NEXT: (9,5,4)
+CHECK-NEXT: (9,9,4)
diff --git a/internal/test/thrust.example.max_abs_diff.filecheck b/internal/test/thrust.example.max_abs_diff.filecheck
new file mode 100644
index 000000000..a02df644f
--- /dev/null
+++ b/internal/test/thrust.example.max_abs_diff.filecheck
@@ -0,0 +1 @@
+     CHECK: maximum absolute difference: 4
diff --git a/internal/test/thrust.example.minimal_custom_backend.filecheck b/internal/test/thrust.example.minimal_custom_backend.filecheck
new file mode 100644
index 000000000..76802325b
--- /dev/null
+++ b/internal/test/thrust.example.minimal_custom_backend.filecheck
@@ -0,0 +1 @@
+     CHECK: Hello, world from for_each(my_system)!
diff --git a/internal/test/thrust.example.minmax.filecheck b/internal/test/thrust.example.minmax.filecheck
new file mode 100644
index 000000000..10e41724d
--- /dev/null
+++ b/internal/test/thrust.example.minmax.filecheck
@@ -0,0 +1,3 @@
+     CHECK: [ 10 17 64 90 97 27 56 45 33 76 ]
+CHECK-NEXT: minimum = 10
+CHECK-NEXT: maximum = 97
diff --git a/internal/test/thrust.example.mode.filecheck b/internal/test/thrust.example.mode.filecheck
new file mode 100644
index 000000000..c253cc483
--- /dev/null
+++ b/internal/test/thrust.example.mode.filecheck
@@ -0,0 +1,9 @@
+     CHECK: initial data
+CHECK-NEXT: 0 0 6 8 9 1 5 3 2 7 0 5 5 8 5 5 8 9 7 9 2 4 8 6 9 9 1 8 9 2 
+CHECK-NEXT: sorted data
+CHECK-NEXT: 0 0 0 1 1 2 2 2 3 4 5 5 5 5 5 6 6 7 7 8 8 8 8 8 9 9 9 9 9 9 
+CHECK-NEXT: values
+CHECK-NEXT: 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: counts
+CHECK-NEXT: 3 2 3 1 1 5 2 2 5 6 
+CHECK-NEXT: Modal value 9 occurs 6 times 
diff --git a/internal/test/thrust.example.monte_carlo.filecheck b/internal/test/thrust.example.monte_carlo.filecheck
new file mode 100644
index 000000000..137aec274
--- /dev/null
+++ b/internal/test/thrust.example.monte_carlo.filecheck
@@ -0,0 +1 @@
+     CHECK: pi is approximately 3.14
diff --git a/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck b/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck
new file mode 100644
index 000000000..8d6bd022b
--- /dev/null
+++ b/internal/test/thrust.example.monte_carlo_disjoint_sequences.filecheck
@@ -0,0 +1 @@
+     CHECK: pi is around 3.1415
diff --git a/internal/test/thrust.example.mr_basic.filecheck b/internal/test/thrust.example.mr_basic.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.norm.filecheck b/internal/test/thrust.example.norm.filecheck
new file mode 100644
index 000000000..8a8e4203e
--- /dev/null
+++ b/internal/test/thrust.example.norm.filecheck
@@ -0,0 +1 @@
+     CHECK: norm is 5.47723
diff --git a/internal/test/thrust.example.padded_grid_reduction.filecheck b/internal/test/thrust.example.padded_grid_reduction.filecheck
new file mode 100644
index 000000000..ed77e84fd
--- /dev/null
+++ b/internal/test/thrust.example.padded_grid_reduction.filecheck
@@ -0,0 +1,13 @@
+     CHECK: padded grid
+CHECK-NEXT:  0.2775 0.7256 0.6979 0.9412 0.4131 0.7202 0.3765 0.4136 0.5766 0.6612 0.4672 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.0137 0.6256 0.1003 0.2374 0.0915 0.0455 0.3187 0.0839 0.8173 0.7281 0.5975 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.2990 0.2693 0.4408 0.1262 0.3812 0.8537 0.9962 0.7528 0.9272 0.7873 0.8984 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.3529 0.5803 0.8900 0.4505 0.0477 0.2683 0.8613 0.0877 0.2438 0.4363 0.6292 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.4561 0.7896 0.6662 0.4988 0.4404 0.6277 0.5752 0.6816 0.1240 0.5018 0.8027 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9527 0.5223 0.9500 0.2376 0.0110 0.7803 0.6221 0.2488 0.7006 0.6347 0.9137 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.0027 0.4972 0.7421 0.4674 0.8961 0.2355 0.9507 0.9211 0.1650 0.4517 0.7143 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.8649 0.2082 0.8464 0.2547 0.4789 0.9534 0.0403 0.6872 0.8964 0.3910 0.2292 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9017 0.1525 0.9041 0.1460 0.1646 0.3839 0.6994 0.0900 0.1671 0.2587 0.5893 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+CHECK-NEXT:  0.9075 0.2186 0.4626 0.8713 0.7073 0.1520 0.9495 0.4137 0.6746 0.7064 0.5609 -1.0000 -1.0000 -1.0000 -1.0000 -1.0000 
+     CHECK: minimum value: 0.0027
+CHECK-NEXT: maximum value: 0.9962
diff --git a/internal/test/thrust.example.permutation_iterator.filecheck b/internal/test/thrust.example.permutation_iterator.filecheck
new file mode 100644
index 000000000..6507af04b
--- /dev/null
+++ b/internal/test/thrust.example.permutation_iterator.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 130
diff --git a/internal/test/thrust.example.raw_reference_cast.filecheck b/internal/test/thrust.example.raw_reference_cast.filecheck
new file mode 100644
index 000000000..ed23222e9
--- /dev/null
+++ b/internal/test/thrust.example.raw_reference_cast.filecheck
@@ -0,0 +1,6 @@
+     CHECK: Before A->B Copy
+CHECK-NEXT: A: 0 1 2 3 4 
+CHECK-NEXT: B: 0 0 0 0 0 
+CHECK-NEXT: After A->B Copy
+CHECK-NEXT: A: 0 1 2 3 4 
+CHECK-NEXT: B: 0 1 2 3 4 
diff --git a/internal/test/thrust.example.remove_points2d.filecheck b/internal/test/thrust.example.remove_points2d.filecheck
new file mode 100644
index 000000000..f69f1cd52
--- /dev/null
+++ b/internal/test/thrust.example.remove_points2d.filecheck
@@ -0,0 +1,36 @@
+     CHECK: Generated 20 points
+CHECK-NEXT: (0.000022,0.085032)
+CHECK-NEXT: (0.601353,0.891611)
+CHECK-NEXT: (0.967956,0.189690)
+CHECK-NEXT: (0.514976,0.398008)
+CHECK-NEXT: (0.262906,0.743512)
+CHECK-NEXT: (0.089548,0.560390)
+CHECK-NEXT: (0.582230,0.809567)
+CHECK-NEXT: (0.591919,0.511713)
+CHECK-NEXT: (0.876634,0.995085)
+CHECK-NEXT: (0.726212,0.966611)
+CHECK-NEXT: (0.297102,0.426051)
+CHECK-NEXT: (0.899498,0.652999)
+CHECK-NEXT: (0.901534,0.961533)
+CHECK-NEXT: (0.164713,0.857987)
+CHECK-NEXT: (0.906845,0.294026)
+CHECK-NEXT: (0.936244,0.414645)
+CHECK-NEXT: (0.308457,0.514893)
+CHECK-NEXT: (0.395430,0.789785)
+CHECK-NEXT: (0.689141,0.544273)
+CHECK-NEXT: (0.592407,0.093630)
+     CHECK: After stream compaction, 14 points remain
+CHECK-NEXT: (0.000022,0.085032)
+CHECK-NEXT: (0.967956,0.189690)
+CHECK-NEXT: (0.514976,0.398008)
+CHECK-NEXT: (0.262906,0.743512)
+CHECK-NEXT: (0.089548,0.560390)
+CHECK-NEXT: (0.582230,0.809567)
+CHECK-NEXT: (0.591919,0.511713)
+CHECK-NEXT: (0.297102,0.426051)
+CHECK-NEXT: (0.164713,0.857987)
+CHECK-NEXT: (0.906845,0.294026)
+CHECK-NEXT: (0.308457,0.514893)
+CHECK-NEXT: (0.395430,0.789785)
+CHECK-NEXT: (0.689141,0.544273)
+CHECK-NEXT: (0.592407,0.093630)
diff --git a/internal/test/thrust.example.repeated_range.filecheck b/internal/test/thrust.example.repeated_range.filecheck
new file mode 100644
index 000000000..e067aed99
--- /dev/null
+++ b/internal/test/thrust.example.repeated_range.filecheck
@@ -0,0 +1,3 @@
+     CHECK: range        10 20 30 40 
+CHECK-NEXT: repeated x2: 10 10 20 20 30 30 40 40 
+CHECK-NEXT: repeated x3: 10 10 10 20 20 20 30 30 30 40 40 40 
diff --git a/internal/test/thrust.example.run_length_decoding.filecheck b/internal/test/thrust.example.run_length_decoding.filecheck
new file mode 100644
index 000000000..49faef7fc
--- /dev/null
+++ b/internal/test/thrust.example.run_length_decoding.filecheck
@@ -0,0 +1,4 @@
+     CHECK: run-length encoded input:
+CHECK-NEXT: (a,3)(b,5)(c,1)(d,2)(e,9)(f,2)
+     CHECK: decoded output:
+CHECK-NEXT: aaabbbbbcddeeeeeeeeeff
diff --git a/internal/test/thrust.example.run_length_encoding.filecheck b/internal/test/thrust.example.run_length_encoding.filecheck
new file mode 100644
index 000000000..7d907ab79
--- /dev/null
+++ b/internal/test/thrust.example.run_length_encoding.filecheck
@@ -0,0 +1,4 @@
+     CHECK: input data:
+CHECK-NEXT: aaabbbbbcddeeeeeeeeeff
+     CHECK: run-length encoded output:
+CHECK-NEXT: (a,3)(b,5)(c,1)(d,2)(e,9)(f,2)
diff --git a/internal/test/thrust.example.saxpy.filecheck b/internal/test/thrust.example.saxpy.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.scan_by_key.filecheck b/internal/test/thrust.example.scan_by_key.filecheck
new file mode 100644
index 000000000..b183794b0
--- /dev/null
+++ b/internal/test/thrust.example.scan_by_key.filecheck
@@ -0,0 +1,16 @@
+     CHECK: Inclusive Segmented Scan w/ Key Sequence
+CHECK-NEXT:  keys          : 0 0 0 1 1 2 2 2 2 3 4 4 5 5 5 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 2 4 6 2 4 2 4 6 8 2 2 4 2 4 6 
+     CHECK: Inclusive Segmented Scan w/ Head Flag Sequence
+CHECK-NEXT:  head flags    : 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 2 4 6 2 4 2 4 6 8 2 2 4 2 4 6 
+     CHECK: Exclusive Segmented Scan w/ Key Sequence
+CHECK-NEXT:  keys          : 0 0 0 1 1 2 2 2 2 3 4 4 5 5 5 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 0 2 4 0 2 0 2 4 6 0 0 2 0 2 4 
+     CHECK: Exclusive Segmented Scan w/ Head Flag Sequence
+CHECK-NEXT:  head flags    : 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 
+CHECK-NEXT:  input values  : 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 
+CHECK-NEXT:  output values : 0 2 4 0 2 0 2 4 6 0 0 2 0 2 4 
diff --git a/internal/test/thrust.example.scan_matrix_by_rows.filecheck b/internal/test/thrust.example.scan_matrix_by_rows.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.set_operations.filecheck b/internal/test/thrust.example.set_operations.filecheck
new file mode 100644
index 000000000..6ccfe8beb
--- /dev/null
+++ b/internal/test/thrust.example.set_operations.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Set A [ 0 2 4 5 6 8 9 ]
+CHECK-NEXT: Set B [ 0 1 2 3 5 7 8 ]
+CHECK-NEXT: Merge(A,B) [ 0 0 1 2 2 3 4 5 5 6 7 8 8 9 ]
+CHECK-NEXT: Union(A,B) [ 0 1 2 3 4 5 6 7 8 9 ]
+CHECK-NEXT: Intersection(A,B) [ 0 2 5 8 ]
+CHECK-NEXT: Difference(A,B) [ 4 6 9 ]
+CHECK-NEXT: SymmetricDifference(A,B) [ 1 3 4 6 7 9 ]
+CHECK-NEXT: SetIntersectionSize(A,B) 4
diff --git a/internal/test/thrust.example.simple_moving_average.filecheck b/internal/test/thrust.example.simple_moving_average.filecheck
new file mode 100644
index 000000000..4fadc201c
--- /dev/null
+++ b/internal/test/thrust.example.simple_moving_average.filecheck
@@ -0,0 +1,29 @@
+     CHECK: data series: [ 0 0 6 9 10 2 5 4 2 8 0 6 6 8 6 5 9 10 7 10 3 4 9 7 9 10 1 9 9 3 ]
+CHECK-NEXT: simple moving averages (window = 4)
+CHECK-NEXT:   [ 0, 4) = 3.75
+CHECK-NEXT:   [ 1, 5) = 6.25
+CHECK-NEXT:   [ 2, 6) = 6.75
+CHECK-NEXT:   [ 3, 7) = 6.5
+CHECK-NEXT:   [ 4, 8) = 5.25
+CHECK-NEXT:   [ 5, 9) = 3.25
+CHECK-NEXT:   [ 6,10) = 4.75
+CHECK-NEXT:   [ 7,11) = 3.5
+CHECK-NEXT:   [ 8,12) = 4
+CHECK-NEXT:   [ 9,13) = 5
+CHECK-NEXT:   [10,14) = 5
+CHECK-NEXT:   [11,15) = 6.5
+CHECK-NEXT:   [12,16) = 6.25
+CHECK-NEXT:   [13,17) = 7
+CHECK-NEXT:   [14,18) = 7.5
+CHECK-NEXT:   [15,19) = 7.75
+CHECK-NEXT:   [16,20) = 9
+CHECK-NEXT:   [17,21) = 7.5
+CHECK-NEXT:   [18,22) = 6
+CHECK-NEXT:   [19,23) = 6.5
+CHECK-NEXT:   [20,24) = 5.75
+CHECK-NEXT:   [21,25) = 7.25
+CHECK-NEXT:   [22,26) = 8.75
+CHECK-NEXT:   [23,27) = 6.75
+CHECK-NEXT:   [24,28) = 7.25
+CHECK-NEXT:   [25,29) = 7.25
+CHECK-NEXT:   [26,30) = 5.5
diff --git a/internal/test/thrust.example.sort.filecheck b/internal/test/thrust.example.sort.filecheck
new file mode 100644
index 000000000..b6450f88d
--- /dev/null
+++ b/internal/test/thrust.example.sort.filecheck
@@ -0,0 +1,21 @@
+     CHECK: sorting integers
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  16 28 40 40 54 57 62 77 78 78 79 86 87 93 94 98
+     CHECK: sorting integers (descending)
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  98 94 93 87 86 79 78 78 77 62 57 54 40 40 28 16
+     CHECK: sorting integers (user-defined comparison)
+CHECK-NEXT:  79 78 62 78 94 40 86 57 40 16 28 54 77 87 93 98
+CHECK-NEXT:  16 28 40 40 54 62 78 78 86 94 98 57 77 79 87 93
+     CHECK: sorting floats
+CHECK-NEXT:  7.5 7.5 6.0 7.5 9.0 4.0 8.5 5.5 4.0 1.5 2.5 5.0 7.5 8.5 9.0 9.5
+CHECK-NEXT:  1.5 2.5 4.0 4.0 5.0 5.5 6.0 7.5 7.5 7.5 7.5 8.5 8.5 9.0 9.0 9.5
+     CHECK: sorting pairs
+CHECK-NEXT:  (7,7) (5,7) (9,3) (8,5) (3,0) (2,4) (7,8) (9,9) (7,1) (1,9) (0,5) (3,6) (8,0) (7,6) (4,2) (8,3)
+CHECK-NEXT:  (0,5) (1,9) (2,4) (3,0) (3,6) (4,2) (5,7) (7,1) (7,6) (7,7) (7,8) (8,0) (8,3) (8,5) (9,3) (9,9)
+     CHECK: key-value sorting
+CHECK-NEXT:  (79, 0) (78, 1) (62, 2) (78, 3) (94, 4) (40, 5) (86, 6) (57, 7) (40, 8) (16, 9) (28,10) (54,11) (77,12) (87,13) (93,14) (98,15)
+CHECK-NEXT:  (16, 9) (28,10) (40, 5) (40, 8) (54,11) (57, 7) (62, 2) (77,12) (78, 1) (78, 3) (79, 0) (86, 6) (87,13) (93,14) (94, 4) (98,15)
+     CHECK: key-value sorting (descending)
+CHECK-NEXT:  (79, 0) (78, 1) (62, 2) (78, 3) (94, 4) (40, 5) (86, 6) (57, 7) (40, 8) (16, 9) (28,10) (54,11) (77,12) (87,13) (93,14) (98,15)
+CHECK-NEXT:  (98,15) (94, 4) (93,14) (87,13) (86, 6) (79, 0) (78, 1) (78, 3) (77,12) (62, 2) (57, 7) (54,11) (40, 5) (40, 8) (28,10) (16, 9)
diff --git a/internal/test/thrust.example.sorting_aos_vs_soa.filecheck b/internal/test/thrust.example.sorting_aos_vs_soa.filecheck
new file mode 100644
index 000000000..f29323710
--- /dev/null
+++ b/internal/test/thrust.example.sorting_aos_vs_soa.filecheck
@@ -0,0 +1,2 @@
+     CHECK: AoS sort took {{[0-9.]+}} milliseconds
+CHECK-NEXT: SoA sort took {{[0-9.]+}} milliseconds
diff --git a/internal/test/thrust.example.sparse_vector.filecheck b/internal/test/thrust.example.sparse_vector.filecheck
new file mode 100644
index 000000000..560378d3c
--- /dev/null
+++ b/internal/test/thrust.example.sparse_vector.filecheck
@@ -0,0 +1,4 @@
+     CHECK: Computing C = A + B for sparse vectors A and B
+CHECK-NEXT: A (2,10) (3,60) (5,20) (8,40) 
+CHECK-NEXT: B (1,50) (2,30) (4,80) (5,30) (7,90) (8,10) 
+CHECK-NEXT: C (1,50) (2,40) (3,60) (4,80) (5,50) (7,90) (8,50) 
diff --git a/internal/test/thrust.example.stream_compaction.filecheck b/internal/test/thrust.example.stream_compaction.filecheck
new file mode 100644
index 000000000..eb62ac24c
--- /dev/null
+++ b/internal/test/thrust.example.stream_compaction.filecheck
@@ -0,0 +1,4 @@
+     CHECK: values: 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: output: 1 3 5 7 9 
+CHECK-NEXT: small_output: 1 3 5 7 9 
+CHECK-NEXT: values: 0 2 4 6 8 
diff --git a/internal/test/thrust.example.strided_range.filecheck b/internal/test/thrust.example.strided_range.filecheck
new file mode 100644
index 000000000..2067ffa17
--- /dev/null
+++ b/internal/test/thrust.example.strided_range.filecheck
@@ -0,0 +1,4 @@
+     CHECK: data: 10 20 30 40 50 60 70 80 
+CHECK-NEXT: sum of even indices: 160
+CHECK-NEXT: sum of odd indices:  200
+CHECK-NEXT: setting odd indices to zero: 10 0 30 0 50 0 70 0 
diff --git a/internal/test/thrust.example.sum.filecheck b/internal/test/thrust.example.sum.filecheck
new file mode 100644
index 000000000..4c7771103
--- /dev/null
+++ b/internal/test/thrust.example.sum.filecheck
@@ -0,0 +1 @@
+     CHECK: sum is 509773
diff --git a/internal/test/thrust.example.sum_rows.filecheck b/internal/test/thrust.example.sum_rows.filecheck
new file mode 100644
index 000000000..ae5f889d7
--- /dev/null
+++ b/internal/test/thrust.example.sum_rows.filecheck
@@ -0,0 +1,5 @@
+     CHECK: [ 10 17 64 90 97 27 56 45 ] = 406
+CHECK-NEXT: [ 33 76 18 60 62 82 63 56 ] = 450
+CHECK-NEXT: [ 88 99 75 96 36 48 90 68 ] = 600
+CHECK-NEXT: [ 91 96 24 87 91 36 94 47 ] = 566
+CHECK-NEXT: [ 37 56 45 81 72 58 63 18 ] = 430
diff --git a/internal/test/thrust.example.summary_statistics.filecheck b/internal/test/thrust.example.summary_statistics.filecheck
new file mode 100644
index 000000000..92c2470ea
--- /dev/null
+++ b/internal/test/thrust.example.summary_statistics.filecheck
@@ -0,0 +1,10 @@
+     CHECK: ******Summary Statistics Example*****
+CHECK-NEXT: The data: 4 7 13 16 
+CHECK-NEXT: Count              : 4
+CHECK-NEXT: Minimum            : 4
+CHECK-NEXT: Maximum            : 16
+CHECK-NEXT: Mean               : 10
+CHECK-NEXT: Variance           : 30
+CHECK-NEXT: Standard Deviation : 4.74342
+CHECK-NEXT: Skewness           : 0
+CHECK-NEXT: Kurtosis           : 1.36
diff --git a/internal/test/thrust.example.summed_area_table.filecheck b/internal/test/thrust.example.summed_area_table.filecheck
new file mode 100644
index 000000000..98fabffca
--- /dev/null
+++ b/internal/test/thrust.example.summed_area_table.filecheck
@@ -0,0 +1,22 @@
+     CHECK: [step 0] initial array
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT:        1        1        1        1 
+CHECK-NEXT: [step 1] scan horizontally
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT: [step 2] transpose array
+CHECK-NEXT:        1        1        1 
+CHECK-NEXT:        2        2        2 
+CHECK-NEXT:        3        3        3 
+CHECK-NEXT:        4        4        4 
+CHECK-NEXT: [step 3] scan transpose horizontally
+CHECK-NEXT:        1        2        3 
+CHECK-NEXT:        2        4        6 
+CHECK-NEXT:        3        6        9 
+CHECK-NEXT:        4        8       12 
+CHECK-NEXT: [step 4] transpose the transpose
+CHECK-NEXT:        1        2        3        4 
+CHECK-NEXT:        2        4        6        8 
+CHECK-NEXT:        3        6        9       12 
diff --git a/internal/test/thrust.example.tiled_range.filecheck b/internal/test/thrust.example.tiled_range.filecheck
new file mode 100644
index 000000000..2ac310b51
--- /dev/null
+++ b/internal/test/thrust.example.tiled_range.filecheck
@@ -0,0 +1,3 @@
+     CHECK: range        10 20 30 40 
+CHECK-NEXT: two tiles:   10 20 30 40 10 20 30 40 
+CHECK-NEXT: three tiles: 10 20 30 40 10 20 30 40 10 20 30 40 
diff --git a/internal/test/thrust.example.transform_input_output_iterator.filecheck b/internal/test/thrust.example.transform_input_output_iterator.filecheck
new file mode 100644
index 000000000..caeca2de5
--- /dev/null
+++ b/internal/test/thrust.example.transform_input_output_iterator.filecheck
@@ -0,0 +1,2 @@
+     CHECK: Expected [ 1050 2060 3070 4080 ]
+CHECK-NEXT: Result   [ 1050 2060 3070 4080 ]
diff --git a/internal/test/thrust.example.transform_iterator.filecheck b/internal/test/thrust.example.transform_iterator.filecheck
new file mode 100644
index 000000000..8d3a4f852
--- /dev/null
+++ b/internal/test/thrust.example.transform_iterator.filecheck
@@ -0,0 +1,7 @@
+     CHECK: values         : 2 5 7 1 6 0 3 8 
+CHECK-NEXT: clamped values : 2 5 5 1 5 1 3 5 
+CHECK-NEXT: sum of clamped values : 27
+CHECK-NEXT: sequence         : 0 1 2 3 4 5 6 7 8 9 
+CHECK-NEXT: clamped sequence : 1 1 2 3 4 5 5 5 5 5 
+CHECK-NEXT: negated sequence : -1 -1 -2 -3 -4 -5 -5 -5 -5 -5 
+CHECK-NEXT: negated values : -2 -5 -7 -1 -6 0 -3 -8 
diff --git a/internal/test/thrust.example.transform_output_iterator.filecheck b/internal/test/thrust.example.transform_output_iterator.filecheck
new file mode 100644
index 000000000..e1e4a92b5
--- /dev/null
+++ b/internal/test/thrust.example.transform_output_iterator.filecheck
@@ -0,0 +1 @@
+     CHECK: result= [ -0.666667 -2.66667 2 ] 
diff --git a/internal/test/thrust.example.uninitialized_vector.filecheck b/internal/test/thrust.example.uninitialized_vector.filecheck
new file mode 100644
index 000000000..e69de29bb
diff --git a/internal/test/thrust.example.version.filecheck b/internal/test/thrust.example.version.filecheck
new file mode 100644
index 000000000..89b4d664a
--- /dev/null
+++ b/internal/test/thrust.example.version.filecheck
@@ -0,0 +1 @@
+     CHECK: Thrust v{{[0-9]+[.][0-9]+[.][0-9]+-[0-9]+}}
diff --git a/internal/test/thrust.example.weld_vertices.filecheck b/internal/test/thrust.example.weld_vertices.filecheck
new file mode 100644
index 000000000..a206e1f62
--- /dev/null
+++ b/internal/test/thrust.example.weld_vertices.filecheck
@@ -0,0 +1,15 @@
+     CHECK: Output Representation
+CHECK-NEXT:  vertices[0] = (0,0)
+CHECK-NEXT:  vertices[1] = (0,1)
+CHECK-NEXT:  vertices[2] = (1,0)
+CHECK-NEXT:  vertices[3] = (1,1)
+CHECK-NEXT:  vertices[4] = (2,0)
+CHECK-NEXT:  indices[0] = 0
+CHECK-NEXT:  indices[1] = 2
+CHECK-NEXT:  indices[2] = 1
+CHECK-NEXT:  indices[3] = 2
+CHECK-NEXT:  indices[4] = 3
+CHECK-NEXT:  indices[5] = 1
+CHECK-NEXT:  indices[6] = 2
+CHECK-NEXT:  indices[7] = 4
+CHECK-NEXT:  indices[8] = 3
diff --git a/internal/test/thrust.example.word_count.filecheck b/internal/test/thrust.example.word_count.filecheck
new file mode 100644
index 000000000..e21beabd7
--- /dev/null
+++ b/internal/test/thrust.example.word_count.filecheck
@@ -0,0 +1,8 @@
+     CHECK: Text sample:
+CHECK-NEXT:   But the raven, sitting lonely on the placid bust, spoke only,
+CHECK-NEXT:   That one word, as if his soul in that one word he did outpour.
+CHECK-NEXT:   Nothing further then he uttered - not a feather then he fluttered -
+CHECK-NEXT:   Till I scarcely more than muttered `Other friends have flown before -
+CHECK-NEXT:   On the morrow he will leave me, as my hopes have flown before.'
+CHECK-NEXT:   Then the bird said, `Nevermore.'
+     CHECK: Text sample contains 65 words
diff --git a/internal/test/thrust.smoke.filecheck b/internal/test/thrust.smoke.filecheck
new file mode 100644
index 000000000..6906f6d86
--- /dev/null
+++ b/internal/test/thrust.smoke.filecheck
@@ -0,0 +1 @@
+     CHECK: SMOKE
diff --git a/internal/test/thrust_nightly.pl b/internal/test/thrust_nightly.pl
new file mode 100755
index 000000000..ab5815111
--- /dev/null
+++ b/internal/test/thrust_nightly.pl
@@ -0,0 +1,601 @@
+#! /usr/bin/perl
+
+###############################################################################
+# Copyright (c) 2018 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+###############################################################################
+
+use strict;
+use warnings;
+
+print(`perl --version`);
+
+use Getopt::Long;
+use Cwd;
+use Cwd "abs_path";
+use Config; # For signal names and numbers.
+use IPC::Open2;
+use File::Temp;
+use POSIX "strftime";
+
+my $have_time_hi_res = 0;
+
+if (eval { require Time::HiRes })
+{
+  printf("#### CONFIG timestamp `gettimeofday`\n");
+
+  import Time::HiRes "gettimeofday";
+
+  $have_time_hi_res = 1;
+} else {
+  printf("#### CONFIG timestamp `time`\n");
+}
+
+sub timestamp()
+{
+  if ($have_time_hi_res) {
+    return gettimeofday();
+  } else {
+    return time();
+  }
+}
+
+my %CmdLineOption;
+my $arch                = "";
+my $abi                 = "";
+my $os                  = "";
+my $build               = "release";
+my $bin_path;
+my $filecheck_path;
+my $filecheck_data_path = "internal/test";
+my $timeout_min         = 15;
+
+# https://stackoverflow.com/questions/29862178/name-of-signal-number-2
+my @sig_names;
+@sig_names[ split ' ', $Config{sig_num} ] = split ' ', $Config{sig_name};
+my %sig_nums;
+@sig_nums{ split ' ', $Config{sig_name} } = split ' ', $Config{sig_num};
+
+if (`uname` =~ m/CYGWIN/) {
+  $os = "win32";
+} elsif ($^O eq "MSWin32") {
+  $os = "win32";
+} else {
+  $os = `uname`;
+  chomp($os);
+}
+
+if ($os eq "win32") {
+  $ENV{'PROCESSOR_ARCHITECTURE'} ||= "";
+  $ENV{'PROCESSOR_ARCHITEW6432'} ||= "";
+
+  if ((lc($ENV{PROCESSOR_ARCHITECTURE}) ne "x86") ||
+      (lc($ENV{PROCESSOR_ARCHITECTURE}) eq "amd64") ||
+      (lc($ENV{PROCESSOR_ARCHITEW6432}) eq "amd64")) {
+    $arch = "x86_64";
+  } else {
+    $arch = "i686";
+  }
+} else {
+  $arch = `uname -m`;
+  chomp($arch);
+}
+
+sub usage()
+{
+  printf("Usage: thrust_nightly.pl <options>\n");
+  printf("Options:\n");
+  printf("  -help                         : Print help message\n");
+  printf("  -forcearch <arch>             : i686|x86_64|ARMv7|aarch64 (default: $arch)\n");
+  printf("  -forceabi <abi>               : Specify abi to be used for arm (gnueabi|gnueabihf)\n");
+  printf("  -forceos <os>                 : win32|Linux|Darwin (default: $os)\n");
+  printf("  -build <release|debug>        : (default: debug)\n");
+  printf("  -bin-path <path>              : Specify location of test binaries\n");
+  printf("  -filecheck-path <path>        : Specify location of filecheck binary\n");
+  printf("  -filecheck-data-path <path>   : Specify location of filecheck data (default: $filecheck_data_path)\n");
+  printf("  -timeout-min <min>            : timeout in minutes for each individual test\n");
+}
+
+GetOptions(\%CmdLineOption,
+           'help' => sub { usage() and exit 0 },
+           "forcearch=s" => \$arch,
+           "forceabi=s" => \$abi,
+           "forceos=s" => \$os,
+           "build=s" => \$build,
+           "bin-path=s" => \$bin_path,
+           "filecheck-path=s" => \$filecheck_path,
+           "filecheck-data-path=s" => \$filecheck_data_path,
+           "timeout-min=i" => \$timeout_min,
+          );
+
+my $pwd = getcwd();
+my $bin_path_root = abs_path ("${pwd}/..");
+
+if ($arch eq "ARMv7") {
+      if ($abi eq "") {
+          $abi = "_gnueabi";  #Use default abi for arm if not specified
+      }
+      else {
+          $abi = "_${abi}";
+      }
+}
+else {
+    $abi = "";                #Ignore abi for architectures other than arm
+}
+
+my $uname = "";
+$uname = $arch;
+chomp($uname);
+
+if (not $bin_path) {
+    $bin_path = "${bin_path_root}/bin/${uname}_${os}${abi}_${build}";
+}
+
+if (not $filecheck_path) {
+    $filecheck_path = "${bin_path}/nvvm/tools";
+}
+
+sub process_return_code {
+    my ($name, $ret, $msg) = @_;
+
+    if ($ret != 0) {
+        my $signal  = $ret & 127;
+        my $app_exit = $ret >> 8;
+        my $dumped_core = $ret & 0x80;
+        if (($app_exit != 0) && ($app_exit != 0)) {
+            if ($msg ne "") {
+                printf("#### ERROR $name exited with return value $app_exit. $msg\n");
+            } else {
+                printf("#### ERROR $name exited with return value $app_exit.\n");
+            }
+        }
+        if ($signal != 0) {
+            if ($msg ne "") {
+                printf("#### ERROR $name received signal SIG$sig_names[$signal] ($signal). $msg\n");
+            } else {
+                printf("#### ERROR $name received signal SIG$sig_names[$signal] ($signal).\n");
+            }
+            if ($sig_nums{'INT'} eq $signal) {
+                die("Terminating testing due to SIGINT.");
+            }
+        }
+        if ($dumped_core != 0) {
+            if ($msg ne "") {
+                printf("#### ERROR $name generated a core dump. $msg\n");
+            } else {
+                printf("#### ERROR $name generated a core dump.\n");
+            }
+        }
+    }
+}
+
+my $have_filecheck = 1;
+
+sub filecheck_smoke_test {
+    my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/thrust.smoke.filecheck";
+
+    my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+    print $filecheck_stdin "SMOKE";
+
+    my $filecheck_ret = 0;
+    if (close($filecheck_stdin) == 0)
+    {
+      $filecheck_ret = $?;
+    }
+
+    if ($filecheck_ret == 0) {
+      printf("&&&& PASSED FileCheck\n");
+    } else {
+      # Use a temporary file to send the output to
+      # FileCheck so we can get the output this time,
+      # because Perl and bidirectional pipes suck.
+      my $tmp = File::Temp->new();
+      my $tmp_filename = $tmp->filename;
+      print $tmp "SMOKE";
+
+      printf("********************************************************************************\n");
+      print `$filecheck_cmd -input-file $tmp_filename`;
+      printf("********************************************************************************\n");
+
+      process_return_code("FileCheck Test", $filecheck_ret, "");
+      printf("&&&& FAILED FileCheck\n");
+
+      $have_filecheck = 0;
+    }
+}
+
+# Wrapper for system that logs the commands so you can see what it did
+sub run_cmd {
+    my ($cmd) = @_;
+    my $ret = 0;
+    my @executable;
+    my @output;
+    my $syst_cmd;
+
+    my $start = timestamp();
+    eval {
+        local $SIG{ALRM} = sub { die("Command timed out (received SIGALRM).\n") };
+        alarm (60 * $timeout_min);
+        $syst_cmd = $cmd;
+
+        @executable = split(' ', $syst_cmd, 2);
+
+        open(my $child, "-|", "$syst_cmd") or die("Could not execute $syst_cmd.\n");
+
+        if ($child)
+        {
+          @output = <$child>;
+        }
+
+        if (close($child) == 0)
+        {
+          $ret = $?;
+        }
+
+        alarm 0;
+    };
+    my $elapsed = timestamp() - $start;
+
+    if ($@) {
+        printf("\n#### ERROR Command timeout reached, killing $executable[0].\n");
+        system("killall ".$executable[0]);
+        return ($sig_nums{'KILL'}, $elapsed, @output);
+    }
+
+    return ($ret, $elapsed, @output);
+}
+
+sub current_time
+{
+   return strftime("%x %X %Z", localtime());
+}
+
+my $failures = 0;
+my $known_failures = 0;
+my $errors = 0;
+my $passes = 0;
+
+sub run_examples {
+    # Get list of tests in binary folder.
+    my $dir = cwd();
+    chdir $bin_path;
+    my @examplelist;
+    if ($os eq "win32")
+    {
+        @examplelist = glob('thrust.example.*.exe');
+    } else {
+        @examplelist = glob('thrust.example.*');
+    }
+
+    chdir $dir;
+
+    my $test;
+    foreach $test (@examplelist)
+    {
+        my $test_exe = $test;
+
+        # Ignore FileCheck files.
+        if ($test =~ /[.]filecheck$/)
+        {
+          next;
+        }
+
+        if ($os eq "win32")
+        {
+          $test =~ s/\.exe//g;
+        }
+
+        # Check the test actually exists.
+        if (!-e "${bin_path}/${test_exe}")
+        {
+          next;
+        }
+
+        my $cmd = "${bin_path}/${test_exe} --verbose 2>&1";
+
+        printf("&&&& RUNNING $test\n");
+        printf("#### CURRENT_TIME " . current_time() . "\n");
+
+        my ($ret, $elapsed, @output) = run_cmd($cmd);
+
+        printf("********************************************************************************\n");
+        print @output;
+        printf("********************************************************************************\n");
+
+        if ($ret != 0) {
+            process_return_code($test, $ret, "Example crash?");
+            printf("&&&& FAILED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            $errors = $errors + 1;
+        } else {
+            printf("&&&& PASSED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            $passes = $passes + 1;
+
+            if ($have_filecheck) {
+                # Check output with LLVM FileCheck.
+
+                printf("&&&& RUNNING FileCheck $test\n");
+
+                if (-f "${filecheck_data_path}/${test}.filecheck") {
+                    # If the filecheck file is empty, don't use filecheck, just
+                    # check if the output file is also empty.
+                    if (-z "${filecheck_data_path}/${test}.filecheck") {
+                        if (join("", @output) eq "") {
+                            printf("&&&& PASSED FileCheck $test\n");
+                            $passes = $passes + 1;
+                        } else {
+                            printf("#### ERROR Output received but not expected.\n");
+                            printf("&&&& FAILED FileCheck $test\n");
+                            $failures = $failures + 1;
+                        }
+                    } else {
+                        my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/$test.filecheck";
+
+                        my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+                        print $filecheck_stdin @output;
+
+                        my $filecheck_ret = 0;
+                        if (close($filecheck_stdin) == 0)
+                        {
+                          $filecheck_ret = $?;
+                        }
+
+                        if ($filecheck_ret == 0) {
+                          printf("&&&& PASSED FileCheck $test\n");
+                          $passes = $passes + 1;
+                        } else {
+                          # Use a temporary file to send the output to
+                          # FileCheck so we can get the output this time,
+                          # because Perl and bidirectional pipes suck.
+                          my $tmp = File::Temp->new();
+                          my $tmp_filename = $tmp->filename;
+                          print $tmp @output;
+
+                          printf("********************************************************************************\n");
+                          print `$filecheck_cmd -input-file $tmp_filename`;
+                          printf("********************************************************************************\n");
+
+                          process_return_code("FileCheck $test", $filecheck_ret, "");
+                          printf("&&&& FAILED FileCheck $test\n");
+                          $failures = $failures + 1;
+                        }
+                    }
+                } else {
+                    printf("#### ERROR $test has no FileCheck comparison.\n");
+                    printf("&&&& FAILED FileCheck $test\n");
+                    $errors = $errors + 1;
+                }
+            }
+        }
+        printf("\n");
+    }
+}
+
+sub run_unit_tests {
+    # Get list of tests in binary folder.
+    my $dir = cwd();
+    chdir $bin_path;
+    my @unittestlist;
+    if ($os eq "win32")
+    {
+        @unittestlist = glob('thrust.test.*.exe');
+    } else {
+        @unittestlist = glob('thrust.test.*');
+    }
+    chdir $dir;
+
+    my $test;
+    foreach $test (@unittestlist)
+    {
+        my $test_exe = $test;
+
+        # Ignore FileCheck files.
+        if ($test =~ /[.]filecheck$/)
+        {
+          next;
+        }
+
+        if ($os eq "win32")
+        {
+          $test =~ s/\.exe//g;
+        }
+
+        # Check the test actually exists.
+        if (!-e "${bin_path}/${test_exe}")
+        {
+          next;
+        }
+
+        # Check the test actually exists
+        next unless (-e "${bin_path}/${test_exe}");
+
+        my $cmd = "${bin_path}/${test_exe} --verbose 2>&1";
+
+        printf("&&&& RUNNING $test\n");
+        printf("#### CURRENT_TIME " . current_time() . "\n");
+
+        my ($ret, $elapsed, @output) = run_cmd($cmd);
+
+        printf("********************************************************************************\n");
+        print @output;
+        printf("********************************************************************************\n");
+        my $fail = 0;
+        my $known_fail = 0;
+        my $error = 0;
+        my $pass = 0;
+        my $found_totals = 0;
+        foreach my $line (@output)
+        {
+            if (($fail, $known_fail, $error, $pass) = $line =~ /Totals: ([0-9]+) failures, ([0-9]+) known failures, ([0-9]+) errors, and ([0-9]+) passes[.]/igs) {
+              $found_totals = 1;
+              $failures = $failures + $fail;
+              $known_failures = $known_failures + $known_fail;
+              $errors = $errors + $error;
+              $passes = $passes + $pass;
+              last;
+            } else {
+              $fail = 0;
+              $known_fail = 0;
+              $error = 0;
+              $pass = 0;
+            }
+        }
+        if ($ret == 0) {
+            if ($found_totals == 0) {
+                $errors = $errors + 1;
+                printf("#### ERROR $test returned 0 and no summary line was found. Invalid test?\n");
+                printf("&&&& FAILED $test\n");
+                printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+            }
+            else {
+                if ($fail != 0 or $error != 0) {
+                    $errors = $errors + 1;
+                    printf("#### ERROR $test returned 0 and had failures or errors. Test driver error?\n");
+                    printf("&&&& FAILED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+                } elsif ($known_fail == 0 and $pass == 0) {
+                    printf("#### DISABLED $test returned 0 and had no failures, known failures, errors or passes.\n");
+                    printf("&&&& PASSED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+                } else {
+                    printf("&&&& PASSED $test\n");
+                    printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+
+                    if ($have_filecheck) {
+                        # Check output with LLVM FileCheck if the test has a FileCheck input.
+
+                        if (-f "${filecheck_data_path}/${test}.filecheck") {
+                            printf("&&&& RUNNING FileCheck $test\n");
+
+                            # If the filecheck file is empty, don't use filecheck,
+                            # just check if the output file is also empty.
+                            if (! -z "${filecheck_data_path}/${test}.filecheck") {
+                                if (@output) {
+                                    printf("&&&& PASSED FileCheck $test\n");
+                                    $passes = $passes + 1;
+                                } else {
+                                    printf("#### Output received but not expected.\n");
+                                    printf("&&&& FAILED FileCheck $test\n");
+                                    $failures = $failures + 1;
+                                }
+                            } else {
+                                my $filecheck_cmd = "$filecheck_path/FileCheck $filecheck_data_path/$test.filecheck";
+
+                                my $filecheck_pid = open(my $filecheck_stdin, "|-", "$filecheck_cmd 2>&1");
+
+                                print $filecheck_stdin @output;
+
+                                my $filecheck_ret = 0;
+                                if (close($filecheck_stdin) == 0)
+                                {
+                                  $filecheck_ret = $?;
+                                }
+
+                                if ($filecheck_ret == 0) {
+                                  printf("&&&& PASSED FileCheck $test\n");
+                                  $passes = $passes + 1;
+                                } else {
+                                  # Use a temporary file to send the output to
+                                  # FileCheck so we can get the output this time,
+                                  # because Perl and bidirectional pipes suck.
+                                  my $tmp = File::Temp->new();
+                                  my $tmp_filename = $tmp->filename;
+                                  print $tmp @output;
+
+                                  printf("********************************************************************************\n");
+                                  print `$filecheck_cmd -input-file $tmp_filename`;
+                                  printf("********************************************************************************\n");
+
+                                  process_return_code("FileCheck $test", $filecheck_ret, "");
+                                  printf("&&&& FAILED FileCheck $test\n");
+                                  $failures = $failures + 1;
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        } else {
+            $errors = $errors + 1;
+            process_return_code($test, $ret, "Test crash?");
+            printf("&&&& FAILED $test\n");
+            printf("#### WALLTIME $test %.2f [s]\n", $elapsed);
+        }
+        printf("\n");
+    }
+}
+
+sub dvs_summary {
+    my $dvs_score = 0;
+    my $denominator = $failures + $known_failures + $errors + $passes;
+    if ($denominator == 0) {
+       $dvs_score = 0;
+    }
+    else {
+       $dvs_score = 100 * (($passes + $known_failures) / $denominator);
+    }
+
+    printf("\n");
+
+    printf("%*%*%*%* FA!LUR3S       $failures\n");
+    printf("%*%*%*%* KN0WN FA!LUR3S $known_failures\n");
+    printf("%*%*%*%* 3RR0RS         $errors\n");
+    printf("%*%*%*%* PASS3S         $passes\n");
+
+    printf("\n");
+
+    # We can't remove "sanity" here yet because DVS looks for this exact string.
+    printf("CUDA DVS BASIC SANITY SCORE : %.1f\n", $dvs_score);
+
+    if ($failures + $errors > 0) {
+        exit(1);
+    }
+}
+
+###############################################################################
+
+printf("#### CONFIG arch `%s`\n", $arch);
+printf("#### CONFIG abi `%s`\n", $abi);
+printf("#### CONFIG os `%s`\n", $os);
+printf("#### CONFIG build `%s`\n", $build);
+printf("#### CONFIG bin_path `%s`\n", $bin_path);
+printf("#### CONFIG have_filecheck `$have_filecheck`\n");
+printf("#### CONFIG filecheck_path `%s`\n", $filecheck_path);
+printf("#### CONFIG filecheck_data_path `%s`\n", $filecheck_data_path);
+printf("#### CONFIG have_time_hi_res `$have_time_hi_res`\n");
+printf("#### CONFIG timeout_min `%s`\n", $timeout_min);
+printf("#### ENV PATH `%s`\n", defined $ENV{'PATH'} ? $ENV{'PATH'} : '');
+printf("#### ENV LD_LIBRARY_PATH `%s`\n", defined $ENV{'LD_LIBRARY_PATH'} ? $ENV{'LD_LIBRARY_PATH'} : '');
+
+printf("\n");
+
+filecheck_smoke_test();
+
+printf("\n");
+
+my $START_TIME = current_time();
+
+run_examples();
+run_unit_tests();
+
+my $STOP_TIME = current_time();
+
+printf("#### START_TIME $START_TIME\n");
+printf("#### STOP_TIME $STOP_TIME\n");
+
+dvs_summary();
+
diff --git a/internal/test/unittest.lst b/internal/test/unittest.lst
new file mode 100644
index 000000000..8ea415184
--- /dev/null
+++ b/internal/test/unittest.lst
@@ -0,0 +1,1267 @@
+TestAdjacentDifference
+TestAdjacentDifferenceCudaStreams
+TestAdjacentDifferenceDeviceSeq
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceDispatchExplicit
+TestAdjacentDifferenceDispatchImplicit
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAdvanceDevice
+TestAdvanceHost
+TestAllOfCudaStreams
+TestAllOfDevice
+TestAllOfDeviceSeq
+TestAllOfDispatchExplicit
+TestAllOfDispatchImplicit
+TestAllOfHost
+TestAllocatorCustomCopyConstruct
+TestAllocatorCustomDefaultConstruct
+TestAllocatorCustomDestroy
+TestAllocatorMinimal
+TestAnyOfCudaStreams
+TestAnyOfDevice
+TestAnyOfDeviceSeq
+TestAnyOfDispatchExplicit
+TestAnyOfDispatchImplicit
+TestAnyOfHost
+TestAssertEqual
+TestAssertGEqual
+TestAssertLEqual
+TestBitAndFunctionalDevice
+TestBitAndFunctionalHost
+TestBitOrFunctionalDevice
+TestBitOrFunctionalHost
+TestBitXorFunctionalDevice
+TestBitXorFunctionalHost
+TestComplexArithmeticTransform
+TestComplexBasicArithmetic
+TestComplexBinaryArithmetic
+TestComplexConstructors
+TestComplexExponentialFunctions
+TestComplexExponentialTransform
+TestComplexGetters
+TestComplexMemberOperators
+TestComplexPlaneTransform
+TestComplexPowerFunctions
+TestComplexPowerTransform
+TestComplexStreamOperators
+TestComplexTrigonometricFunctions
+TestComplexTrigonometricTransform
+TestComplexUnaryArithmetic
+TestComputeCapability
+TestConstantIteratorComparison
+TestConstantIteratorConstructFromConvertibleSystem
+TestConstantIteratorCopyDevice
+TestConstantIteratorCopyHost
+TestConstantIteratorIncrement
+TestConstantIteratorReduce
+TestConstantIteratorTransformDevice
+TestConstantIteratorTransformHost
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDispatchExplicit
+TestCopyDispatchImplicit
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfDispatchExplicit
+TestCopyIfDispatchImplicit
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilDispatchExplicit
+TestCopyIfStencilDispatchImplicit
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyNConstantIteratorToZipIteratorDevice
+TestCopyNConstantIteratorToZipIteratorHost
+TestCopyNCountingIteratorDevice
+TestCopyNCountingIteratorHost
+TestCopyNDispatchExplicit
+TestCopyNDispatchImplicit
+TestCopyNFromConstIterator
+TestCopyNListToDevice
+TestCopyNListToHost
+TestCopyNMatchingTypesDevice
+TestCopyNMatchingTypesHost
+TestCopyNMixedTypesDevice
+TestCopyNMixedTypesHost
+TestCopyNToDiscardIterator
+TestCopyNVectorBool
+TestCopyNZipIteratorDevice
+TestCopyNZipIteratorHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountCudaStreams
+TestCountDeviceSeq
+TestCountDispatchExplicit
+TestCountDispatchImplicit
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfDeviceSeq
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestCountingIteratorComparison
+TestCountingIteratorCopyConstructor
+TestCountingIteratorDifference
+TestCountingIteratorDistance
+TestCountingIteratorFloatComparison
+TestCountingIteratorIncrement
+TestCountingIteratorLowerBound
+TestCountingIteratorUnsignedType
+TestCudaMallocResultAligned
+TestCudaReduceIntervals
+TestCudaReduceIntervalsSimple
+TestDeviceDeleteDestructorInvocation
+TestDeviceDereferenceCountingIterator
+TestDeviceDereferenceDevicePtr
+TestDeviceDereferenceDeviceVectorIterator
+TestDeviceDereferenceTransformIterator
+TestDeviceDereferenceTransformedCountingIterator
+TestDevicePointerManipulation
+TestDeviceReferenceAssignmentFromDeviceReference
+TestDeviceReferenceConstructorFromDevicePointer
+TestDeviceReferenceConstructorFromDeviceReference
+TestDeviceReferenceManipulation
+TestDiscardIteratorComparison
+TestDiscardIteratorIncrement
+TestDistanceDevice
+TestDistanceHost
+TestDividesFunctionalDevice
+TestDividesFunctionalHost
+TestEqual
+TestEqualCudaStreams
+TestEqualDeviceSeq
+TestEqualDispatchExplicit
+TestEqualDispatchImplicit
+TestEqualSimpleDevice
+TestEqualSimpleHost
+TestEqualToFunctionalDevice
+TestEqualToFunctionalHost
+TestExclusiveScan32
+TestExclusiveScanByKeyCudaStreams
+TestExclusiveScanByKeyDispatchExplicit
+TestExclusiveScanByKeyDispatchImplicit
+TestExclusiveScanByKeySimpleDevice
+TestExclusiveScanByKeySimpleHost
+TestExclusiveScanDispatchExplicit
+TestExclusiveScanDispatchImplicit
+TestFill
+TestFillCudaStreams
+TestFillDeviceSeq
+TestFillDiscardIterator
+TestFillDispatchExplicit
+TestFillDispatchImplicit
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDeviceSeq
+TestFillNDiscardIterator
+TestFillNDispatchExplicit
+TestFillNDispatchImplicit
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestFind
+TestFindCudaStreams
+TestFindDeviceSeq
+TestFindDispatchExplicit
+TestFindDispatchImplicit
+TestFindIf
+TestFindIfDeviceSeq
+TestFindIfDispatchExplicit
+TestFindIfDispatchImplicit
+TestFindIfNot
+TestFindIfNotDeviceSeq
+TestFindIfNotDispatchExplicit
+TestFindIfNotDispatchImplicit
+TestFindIfNotSimpleDevice
+TestFindIfNotSimpleHost
+TestFindIfSimpleDevice
+TestFindIfSimpleHost
+TestFindSimpleDevice
+TestFindSimpleHost
+TestForEach
+TestForEachCudaStreams
+TestForEachDeviceSeq
+TestForEachDispatchExplicit
+TestForEachDispatchImplicit
+TestForEachLargeRegisterFootprint
+TestForEachN
+TestForEachNDeviceSeq
+TestForEachNDispatchExplicit
+TestForEachNDispatchImplicit
+TestForEachNLargeRegisterFootprint
+TestForEachNSimpleAnySystem
+TestForEachNSimpleDevice
+TestForEachNSimpleHost
+TestForEachNWithLargeTypes
+TestForEachSimpleAnySystem
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestFreeDispatchExplicit
+TestFunctionalPlaceholdersBinaryEqualToDevice
+TestFunctionalPlaceholdersBinaryEqualToHost
+TestFunctionalPlaceholdersBinaryGreaterDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualHost
+TestFunctionalPlaceholdersBinaryGreaterHost
+TestFunctionalPlaceholdersBinaryLessDevice
+TestFunctionalPlaceholdersBinaryLessEqualDevice
+TestFunctionalPlaceholdersBinaryLessEqualHost
+TestFunctionalPlaceholdersBinaryLessHost
+TestFunctionalPlaceholdersBinaryNotEqualToDevice
+TestFunctionalPlaceholdersBinaryNotEqualToHost
+TestFunctionalPlaceholdersBitAnd<thrust::device_vector>
+TestFunctionalPlaceholdersBitAnd<thrust::host_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitNegateDevice
+TestFunctionalPlaceholdersBitNegateHost
+TestFunctionalPlaceholdersBitOr<thrust::device_vector>
+TestFunctionalPlaceholdersBitOr<thrust::host_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitXor<thrust::device_vector>
+TestFunctionalPlaceholdersBitXor<thrust::host_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::host_vector>
+TestFunctionalPlaceholdersDivides<thrust::device_vector>
+TestFunctionalPlaceholdersDivides<thrust::host_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersLogicalAndDevice
+TestFunctionalPlaceholdersLogicalAndHost
+TestFunctionalPlaceholdersLogicalNotDevice
+TestFunctionalPlaceholdersLogicalNotHost
+TestFunctionalPlaceholdersLogicalOrDevice
+TestFunctionalPlaceholdersLogicalOrHost
+TestFunctionalPlaceholdersMinus<thrust::device_vector>
+TestFunctionalPlaceholdersMinus<thrust::host_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersModulus<thrust::device_vector>
+TestFunctionalPlaceholdersModulus<thrust::host_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::device_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::host_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersNegateDevice
+TestFunctionalPlaceholdersNegateHost
+TestFunctionalPlaceholdersPlus<thrust::device_vector>
+TestFunctionalPlaceholdersPlus<thrust::host_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersPrefixDecrementDevice
+TestFunctionalPlaceholdersPrefixDecrementHost
+TestFunctionalPlaceholdersPrefixIncrementDevice
+TestFunctionalPlaceholdersPrefixIncrementHost
+TestFunctionalPlaceholdersSuffixDecrementDevice
+TestFunctionalPlaceholdersSuffixDecrementHost
+TestFunctionalPlaceholdersSuffixIncrementDevice
+TestFunctionalPlaceholdersSuffixIncrementHost
+TestFunctionalPlaceholdersTransformIterator<thrust::device_vector>
+TestFunctionalPlaceholdersTransformIterator<thrust::host_vector>
+TestFunctionalPlaceholdersUnaryPlusDevice
+TestFunctionalPlaceholdersUnaryPlusHost
+TestFunctionalPlaceholdersValue<thrust::device_vector>
+TestFunctionalPlaceholdersValue<thrust::host_vector>
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherCudaStreams
+TestGatherDeviceSeq
+TestGatherDispatchExplicit
+TestGatherDispatchImplicit
+TestGatherIf
+TestGatherIfCudaStreams
+TestGatherIfDeviceSeq
+TestGatherIfDispatchExplicit
+TestGatherIfDispatchImplicit
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateCudaStreams
+TestGenerateDeviceSeq
+TestGenerateDispatchExplicit
+TestGenerateDispatchImplicit
+TestGenerateNCudaStreams
+TestGenerateNDeviceSeq
+TestGenerateNDispatchExplicit
+TestGenerateNDispatchImplicit
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestGetTemporaryBuffer
+TestGetTemporaryBufferDeviceSeq
+TestGetTemporaryBufferDispatchExplicit
+TestGetTemporaryBufferDispatchImplicit
+TestGreaterEqualFunctionalDevice
+TestGreaterEqualFunctionalHost
+TestGreaterFunctionalDevice
+TestGreaterFunctionalHost
+TestIdentityFunctionalDevice
+TestIdentityFunctionalHost
+TestInclusiveScan32
+TestInclusiveScanByKeyCudaStreams
+TestInclusiveScanByKeyDispatchExplicit
+TestInclusiveScanByKeyDispatchImplicit
+TestInclusiveScanByKeySimpleDevice
+TestInclusiveScanByKeySimpleHost
+TestInclusiveScanByKeyTransformIteratorDevice
+TestInclusiveScanByKeyTransformIteratorHost
+TestInclusiveScanDispatchExplicit
+TestInclusiveScanDispatchImplicit
+TestInclusiveScanWithIndirectionDevice
+TestInclusiveScanWithIndirectionHost
+TestInnerProduct
+TestInnerProductCudaStreams
+TestInnerProductDeviceSeq
+TestInnerProductDispatchExplicit
+TestInnerProductDispatchImplicit
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPartitionedCudaStreams
+TestIsPartitionedDevice
+TestIsPartitionedDeviceSeq
+TestIsPartitionedDispatchExplicit
+TestIsPartitionedDispatchImplicit
+TestIsPartitionedHost
+TestIsPartitionedSimpleDevice
+TestIsPartitionedSimpleHost
+TestIsPlainOldData
+TestIsSortedCudaStreams
+TestIsSortedDevice
+TestIsSortedDeviceSeq
+TestIsSortedDispatchExplicit
+TestIsSortedDispatchImplicit
+TestIsSortedHost
+TestIsSortedRepeatedElementsDevice
+TestIsSortedRepeatedElementsHost
+TestIsSortedSimpleDevice
+TestIsSortedSimpleHost
+TestIsSortedUntilCudaStreams
+TestIsSortedUntilDevice
+TestIsSortedUntilDeviceSeq
+TestIsSortedUntilExplicit
+TestIsSortedUntilHost
+TestIsSortedUntilImplicit
+TestIsSortedUntilRepeatedElementsDevice
+TestIsSortedUntilRepeatedElementsHost
+TestIsSortedUntilSimpleDevice
+TestIsSortedUntilSimpleHost
+TestIsTrivialIterator
+TestLessEqualFunctionalDevice
+TestLessEqualFunctionalHost
+TestLessFunctionalDevice
+TestLessFunctionalHost
+TestLog2
+TestLogicalAndFunctionalDevice
+TestLogicalAndFunctionalHost
+TestLogicalNotFunctionalDevice
+TestLogicalNotFunctionalHost
+TestLogicalOrFunctionalDevice
+TestLogicalOrFunctionalHost
+TestMakeConstantIterator
+TestMakeDevicePointer
+TestMakeDiscardIterator
+TestMakePermutationIteratorDevice
+TestMakePermutationIteratorHost
+TestMakeTransformIteratorDevice
+TestMakeTransformIteratorHost
+TestMakeTuple
+TestMalloc
+TestMallocDeviceSeq
+TestMallocDispatchExplicit
+TestMax
+TestMaxActiveBlocks
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementCudaStreams
+TestMaxElementDeviceSeq
+TestMaxElementDispatchExplicit
+TestMaxElementDispatchImplicit
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMaximumFunctionalDevice
+TestMaximumFunctionalHost
+TestMerge
+TestMergeByKey
+TestMergeByKeyCudaStreams
+TestMergeByKeyDescending
+TestMergeByKeyDeviceSeq
+TestMergeByKeyDispatchExplicit
+TestMergeByKeyDispatchImplicit
+TestMergeByKeySimpleDevice
+TestMergeByKeySimpleHost
+TestMergeByKeyToDiscardIterator
+TestMergeCudaStreams
+TestMergeDescending
+TestMergeDeviceSeq
+TestMergeDispatchExplicit
+TestMergeDispatchImplicit
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeSortAscendingKeyValue
+TestMergeSortDescendingKey
+TestMergeSortDescendingKeyValue
+TestMergeSortKeySimple
+TestMergeSortKeyValue
+TestMergeSortKeyValueSimple
+TestMergeSortStableKeySimple
+TestMergeToDiscardIterator
+TestMin
+TestMinElement
+TestMinElementCudaStreams
+TestMinElementDeviceSeq
+TestMinElementDispatchExplicit
+TestMinElementDispatchImplicit
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementCudaStreams
+TestMinMaxElementDeviceSeq
+TestMinMaxElementDispatchExplicit
+TestMinMaxElementDispatchImplicit
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestMinimumFunctionalDevice
+TestMinimumFunctionalHost
+TestMinstdRand0Equal
+TestMinstdRand0Max
+TestMinstdRand0Min
+TestMinstdRand0SaveRestore
+TestMinstdRand0Unequal
+TestMinstdRand0Validation
+TestMinstdRandEqual
+TestMinstdRandMax
+TestMinstdRandMin
+TestMinstdRandSaveRestore
+TestMinstdRandUnequal
+TestMinstdRandValidation
+TestMinusFunctionalDevice
+TestMinusFunctionalHost
+TestMismatchCudaStreams
+TestMismatchDeviceSeq
+TestMismatchDispatchExplicit
+TestMismatchDispatchImplicit
+TestMismatchSimpleDevice
+TestMismatchSimpleHost
+TestModulusFunctionalDevice
+TestModulusFunctionalHost
+TestMultipliesFunctionalDevice
+TestMultipliesFunctionalHost
+TestNegateFunctionalDevice
+TestNegateFunctionalHost
+TestNoneOfCudaStreams
+TestNoneOfDevice
+TestNoneOfDeviceSeq
+TestNoneOfDispatchExplicit
+TestNoneOfDispatchImplicit
+TestNoneOfHost
+TestNormalDistributionMax
+TestNormalDistributionMin
+TestNormalDistributionSaveRestore
+TestNot1Device
+TestNot1Host
+TestNot2Device
+TestNot2Host
+TestNotEqualToFunctionalDevice
+TestNotEqualToFunctionalHost
+TestPairComparison
+TestPairGet
+TestPairManipulation
+TestPairReduce
+TestPairScan
+TestPairScanByKey
+TestPairStableSort
+TestPairStableSortByKey
+TestPairStableSortByKeyDeviceSeq
+TestPairStableSortDeviceSeq
+TestPairSwap
+TestPairTransform
+TestPairTupleElement
+TestPairTupleSize
+TestPartition
+TestPartitionCopy
+TestPartitionCopyDeviceSeq
+TestPartitionCopyDispatchExplicit
+TestPartitionCopyDispatchImplicit
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyStencil
+TestPartitionCopyStencilDispatchExplicit
+TestPartitionCopyStencilDispatchImplicit
+TestPartitionCopyStencilSimpleDevice
+TestPartitionCopyStencilSimpleHost
+TestPartitionCopyStencilToDiscardIterator
+TestPartitionCopyToDiscardIterator
+TestPartitionCudaStreams
+TestPartitionDeviceSeq
+TestPartitionDispatchExplicit
+TestPartitionDispatchImplicit
+TestPartitionPointCudaStreams
+TestPartitionPointDevice
+TestPartitionPointDeviceSeq
+TestPartitionPointDispatchExplicit
+TestPartitionPointDispatchImplicit
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionStencil
+TestPartitionStencilDeviceSeq
+TestPartitionStencilDispatchExplicit
+TestPartitionStencilDispatchImplicit
+TestPartitionStencilSimpleDevice
+TestPartitionStencilSimpleHost
+TestPartitionStencilZipIteratorDevice
+TestPartitionStencilZipIteratorHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestPermutationIteratorGatherDevice
+TestPermutationIteratorGatherHost
+TestPermutationIteratorHostDeviceGather
+TestPermutationIteratorHostDeviceScatter
+TestPermutationIteratorReduceDevice
+TestPermutationIteratorReduceHost
+TestPermutationIteratorScatterDevice
+TestPermutationIteratorScatterHost
+TestPermutationIteratorSimpleDevice
+TestPermutationIteratorSimpleHost
+TestPermutationIteratorWithCountingIteratorDevice
+TestPermutationIteratorWithCountingIteratorHost
+TestPinnedAllocatorSimple
+TestPlusFunctionalDevice
+TestPlusFunctionalHost
+TestProject1stFunctionalDevice
+TestProject1stFunctionalHost
+TestProject2ndFunctionalDevice
+TestProject2ndFunctionalHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortByKeyLongLongValues
+TestRadixSortByKeyShortValues
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestRanlux24BaseEqual
+TestRanlux24BaseMax
+TestRanlux24BaseMin
+TestRanlux24BaseSaveRestore
+TestRanlux24BaseUnequal
+TestRanlux24BaseValidation
+TestRanlux24Equal
+TestRanlux24Max
+TestRanlux24Min
+TestRanlux24SaveRestore
+TestRanlux24Unequal
+TestRanlux24Validation
+TestRanlux48BaseEqual
+TestRanlux48BaseMax
+TestRanlux48BaseMin
+TestRanlux48BaseSaveRestore
+TestRanlux48BaseUnequal
+TestRanlux48BaseValidation
+TestRanlux48Equal
+TestRanlux48Max
+TestRanlux48Min
+TestRanlux48SaveRestore
+TestRanlux48Unequal
+TestRanlux48Validation
+TestRawPointerCastDevice
+TestRawPointerCastHost
+TestReduce
+TestReduceByKey
+TestReduceByKeyCudaStreams
+TestReduceByKeyDeviceSeq
+TestReduceByKeyDispatchExplicit
+TestReduceByKeyDispatchImplicit
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceCountingIterator
+TestReduceCudaStreams
+TestReduceDeviceSeq
+TestReduceDispatchExplicit
+TestReduceDispatchImplicit
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithLargeTypes
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyCudaStreams
+TestRemoveCopyDeviceSeq
+TestRemoveCopyDispatchExplicit
+TestRemoveCopyDispatchImplicit
+TestRemoveCopyIf
+TestRemoveCopyIfCudaStreams
+TestRemoveCopyIfDeviceSeq
+TestRemoveCopyIfDispatchExplicit
+TestRemoveCopyIfDispatchImplicit
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilCudaStreams
+TestRemoveCopyIfStencilDeviceSeq
+TestRemoveCopyIfStencilDispatchExplicit
+TestRemoveCopyIfStencilDispatchImplicit
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveCudaStreams
+TestRemoveDeviceSeq
+TestRemoveDispatchExplicit
+TestRemoveDispatchImplicit
+TestRemoveIf
+TestRemoveIfCudaStreams
+TestRemoveIfDeviceSeq
+TestRemoveIfDispatchExplicit
+TestRemoveIfDispatchImplicit
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilCudaStreams
+TestRemoveIfStencilDeviceSeq
+TestRemoveIfStencilDispatchExplicit
+TestRemoveIfStencilDispatchImplicit
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyDeviceSeq
+TestReplaceCopyDispatchExplicit
+TestReplaceCopyDispatchImplicit
+TestReplaceCopyIf
+TestReplaceCopyIfDeviceSeq
+TestReplaceCopyIfDispatchExplicit
+TestReplaceCopyIfDispatchImplicit
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilDeviceSeq
+TestReplaceCopyIfStencilDispatchExplicit
+TestReplaceCopyIfStencilDispatchImplicit
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceCudaStreams
+TestReplaceDeviceSeq
+TestReplaceDispatchExplicit
+TestReplaceDispatchImplicit
+TestReplaceIf
+TestReplaceIfDeviceSeq
+TestReplaceIfDispatchExplicit
+TestReplaceIfDispatchImplicit
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilDeviceSeq
+TestReplaceIfStencilDispatchExplicit
+TestReplaceIfStencilDispatchImplicit
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopyDeviceSeq
+TestReverseCopyDispatchExplicit
+TestReverseCopyDispatchImplicit
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseCudaStreams
+TestReverseDeviceSeq
+TestReverseDispatchExplicit
+TestReverseDispatchImplicit
+TestReverseIteratorCopyConstructor
+TestReverseIteratorCopyDevice
+TestReverseIteratorCopyHost
+TestReverseIteratorExclusiveScan
+TestReverseIteratorExclusiveScanSimple
+TestReverseIteratorIncrement
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestScalarBinarySearchDescendingSimpleDevice
+TestScalarBinarySearchDescendingSimpleHost
+TestScalarBinarySearchDispatchExplicit
+TestScalarBinarySearchDispatchImplicit
+TestScalarBinarySearchSimpleDevice
+TestScalarBinarySearchSimpleHost
+TestScalarEqualRangeDescendingSimpleDevice
+TestScalarEqualRangeDescendingSimpleHost
+TestScalarEqualRangeDispatchExplicit
+TestScalarEqualRangeDispatchImplicit
+TestScalarEqualRangeSimpleDevice
+TestScalarEqualRangeSimpleHost
+TestScalarLowerBoundDescendingSimpleDevice
+TestScalarLowerBoundDescendingSimpleHost
+TestScalarLowerBoundDispatchExplicit
+TestScalarLowerBoundDispatchImplicit
+TestScalarLowerBoundSimpleDevice
+TestScalarLowerBoundSimpleHost
+TestScalarUpperBoundDescendingSimpleDevice
+TestScalarUpperBoundDescendingSimpleHost
+TestScalarUpperBoundDispatchExplicit
+TestScalarUpperBoundDispatchImplicit
+TestScalarUpperBoundSimpleDevice
+TestScalarUpperBoundSimpleHost
+TestScan
+TestScanByKeyDeviceSeq
+TestScanByKeyHeadFlagsDevice
+TestScanByKeyHeadFlagsHost
+TestScanByKeyLargeInput
+TestScanByKeyMixedTypes
+TestScanByKeyReusedKeysDevice
+TestScanByKeyReusedKeysHost
+TestScanByKeyWithLargeTypes
+TestScanCudaStreams
+TestScanDeviceDevice
+TestScanDeviceSeq
+TestScanMixedTypes
+TestScanMixedTypesDevice
+TestScanMixedTypesHost
+TestScanSimpleDevice
+TestScanSimpleHost
+TestScanToDiscardIterator
+TestScanWithLargeTypes
+TestScanWithOperator
+TestScanWithOperatorToDiscardIterator
+TestScatter
+TestScatterCountingIteratorDevice
+TestScatterCountingIteratorHost
+TestScatterCudaStreams
+TestScatterDeviceSeq
+TestScatterDispatchExplicit
+TestScatterDispatchImplicit
+TestScatterIf
+TestScatterIfCountingIteratorDevice
+TestScatterIfCountingIteratorHost
+TestScatterIfCudaStreams
+TestScatterIfDeviceSeq
+TestScatterIfDispatchExplicit
+TestScatterIfDispatchImplicit
+TestScatterIfSimpleDevice
+TestScatterIfSimpleHost
+TestScatterIfToDiscardIterator
+TestScatterSimpleDevice
+TestScatterSimpleHost
+TestScatterToDiscardIterator
+TestSelectSystemCudaToCpp
+TestSelectSystemDifferentTypes
+TestSelectSystemSameTypes
+TestSequence
+TestSequenceCudaStreams
+TestSequenceDeviceSeq
+TestSequenceDispatchExplicit
+TestSequenceDispatchImplicit
+TestSequenceSimpleDevice
+TestSequenceSimpleHost
+TestSequenceToDiscardIterator
+TestSetDifference
+TestSetDifferenceByKey
+TestSetDifferenceByKeyCudaStreams
+TestSetDifferenceByKeyDescending
+TestSetDifferenceByKeyDescendingSimpleDevice
+TestSetDifferenceByKeyDescendingSimpleHost
+TestSetDifferenceByKeyDeviceSeq
+TestSetDifferenceByKeyDispatchExplicit
+TestSetDifferenceByKeyDispatchImplicit
+TestSetDifferenceByKeyEquivalentRanges
+TestSetDifferenceByKeyMultiset
+TestSetDifferenceByKeySimpleDevice
+TestSetDifferenceByKeySimpleHost
+TestSetDifferenceCudaStreams
+TestSetDifferenceDescending
+TestSetDifferenceDescendingSimpleDevice
+TestSetDifferenceDescendingSimpleHost
+TestSetDifferenceDeviceSeq
+TestSetDifferenceDispatchExplicit
+TestSetDifferenceDispatchImplicit
+TestSetDifferenceEquivalentRanges
+TestSetDifferenceKeyValue
+TestSetDifferenceMultiset
+TestSetDifferenceSimpleDevice
+TestSetDifferenceSimpleHost
+TestSetIntersection
+TestSetIntersectionByKey
+TestSetIntersectionByKeyCudaStreams
+TestSetIntersectionByKeyDescending
+TestSetIntersectionByKeyDescendingSimpleDevice
+TestSetIntersectionByKeyDescendingSimpleHost
+TestSetIntersectionByKeyDeviceSeq
+TestSetIntersectionByKeyDispatchExplicit
+TestSetIntersectionByKeyDispatchImplicit
+TestSetIntersectionByKeyEquivalentRanges
+TestSetIntersectionByKeyMultiset
+TestSetIntersectionByKeySimpleDevice
+TestSetIntersectionByKeySimpleHost
+TestSetIntersectionCudaStreams
+TestSetIntersectionDescending
+TestSetIntersectionDescendingSimpleDevice
+TestSetIntersectionDescendingSimpleHost
+TestSetIntersectionDeviceSeq
+TestSetIntersectionDispatchExplicit
+TestSetIntersectionDispatchImplicit
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionKeyValue
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceByKey
+TestSetSymmetricDifferenceByKeyCudaStreams
+TestSetSymmetricDifferenceByKeyDescending
+TestSetSymmetricDifferenceByKeyDescendingSimpleDevice
+TestSetSymmetricDifferenceByKeyDescendingSimpleHost
+TestSetSymmetricDifferenceByKeyDeviceSeq
+TestSetSymmetricDifferenceByKeyDispatchExplicit
+TestSetSymmetricDifferenceByKeyDispatchImplicit
+TestSetSymmetricDifferenceByKeyEquivalentRanges
+TestSetSymmetricDifferenceByKeyMultiset
+TestSetSymmetricDifferenceByKeySimpleDevice
+TestSetSymmetricDifferenceByKeySimpleHost
+TestSetSymmetricDifferenceCudaStreams
+TestSetSymmetricDifferenceDescending
+TestSetSymmetricDifferenceDescendingSimpleDevice
+TestSetSymmetricDifferenceDescendingSimpleHost
+TestSetSymmetricDifferenceDeviceSeq
+TestSetSymmetricDifferenceDispatchExplicit
+TestSetSymmetricDifferenceDispatchImplicit
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionByKey
+TestSetUnionByKeyCudaStreams
+TestSetUnionByKeyDescending
+TestSetUnionByKeyDescendingSimpleDevice
+TestSetUnionByKeyDescendingSimpleHost
+TestSetUnionByKeyDeviceSeq
+TestSetUnionByKeyDispatchExplicit
+TestSetUnionByKeyDispatchImplicit
+TestSetUnionByKeyEquivalentRanges
+TestSetUnionByKeyMultiset
+TestSetUnionByKeySimpleDevice
+TestSetUnionByKeySimpleHost
+TestSetUnionCudaStreams
+TestSetUnionDescending
+TestSetUnionDescendingSimpleDevice
+TestSetUnionDescendingSimpleHost
+TestSetUnionDeviceSeq
+TestSetUnionDispatchExplicit
+TestSetUnionDispatchImplicit
+TestSetUnionKeyValue
+TestSetUnionKeyValueDescending
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestSortAscendingKey
+TestSortAscendingKeyValue
+TestSortBool
+TestSortBoolDescending
+TestSortByKeyBool
+TestSortByKeyBoolDescending
+TestSortByKeyCudaStreams
+TestSortByKeyDeviceSeq
+TestSortByKeyDispatchExplicit
+TestSortByKeyDispatchImplicit
+TestSortByKeyPermutationIteratorDevice
+TestSortByKeyPermutationIteratorHost
+TestSortByKeySimpleDevice
+TestSortByKeySimpleHost
+TestSortByKeyVariableBits
+TestSortCudaStreams
+TestSortDescendingKey
+TestSortDescendingKeyValue
+TestSortDeviceSeq
+TestSortDispatchExplicit
+TestSortDispatchImplicit
+TestSortPermutationIteratorDevice
+TestSortPermutationIteratorHost
+TestSortSimpleDevice
+TestSortSimpleHost
+TestSortVariableBits
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopyDeviceSeq
+TestStablePartitionCopyDispatchExplicit
+TestStablePartitionCopyDispatchImplicit
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyStencil
+TestStablePartitionCopyStencilDispatchExplicit
+TestStablePartitionCopyStencilDispatchImplicit
+TestStablePartitionCopyStencilSimpleDevice
+TestStablePartitionCopyStencilSimpleHost
+TestStablePartitionCopyStencilToDiscardIterator
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionDeviceSeq
+TestStablePartitionDispatchExplicit
+TestStablePartitionDispatchImplicit
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionStencil
+TestStablePartitionStencilDeviceSeq
+TestStablePartitionStencilDispatchExplicit
+TestStablePartitionStencilDispatchImplicit
+TestStablePartitionStencilSimpleDevice
+TestStablePartitionStencilSimpleHost
+TestStablePartitionStencilZipIteratorDevice
+TestStablePartitionStencilZipIteratorHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestStableSort
+TestStableSortByKey
+TestStableSortByKeyDispatchExplicit
+TestStableSortByKeyDispatchImplicit
+TestStableSortByKeyPermutationIteratorDevice
+TestStableSortByKeyPermutationIteratorHost
+TestStableSortByKeySemantics
+TestStableSortByKeySimpleDevice
+TestStableSortByKeySimpleHost
+TestStableSortByKeyWithLargeKeys
+TestStableSortByKeyWithLargeKeysAndValues
+TestStableSortByKeyWithLargeValues
+TestStableSortDispatchExplicit
+TestStableSortDispatchImplicit
+TestStableSortPermutationIteratorDevice
+TestStableSortPermutationIteratorHost
+TestStableSortSemantics
+TestStableSortSimpleDevice
+TestStableSortSimpleHost
+TestStableSortWithIndirectionDevice
+TestStableSortWithIndirectionHost
+TestStableSortWithLargeKeys
+TestStandardIntegerTypes
+TestSwapRanges
+TestSwapRangesCudaStreams
+TestSwapRangesDeviceSeq
+TestSwapRangesDispatchExplicit
+TestSwapRangesDispatchImplicit
+TestSwapRangesSimpleDevice
+TestSwapRangesSimpleHost
+TestSwapRangesUserSwap
+TestTabulate
+TestTabulateCudaStreams
+TestTabulateDeviceSeq
+TestTabulateDispatchExplicit
+TestTabulateDispatchImplicit
+TestTabulateSimpleDevice
+TestTabulateSimpleHost
+TestTabulateToDiscardIterator
+TestTaus88Equal
+TestTaus88Max
+TestTaus88Min
+TestTaus88SaveRestore
+TestTaus88Unequal
+TestTaus88Validation
+TestTransformBinary
+TestTransformBinaryCountingIterator
+TestTransformBinaryCudaStreams
+TestTransformBinaryDeviceSeq
+TestTransformBinaryDispatchExplicit
+TestTransformBinaryDispatchImplicit
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformExclusiveScanDispatchExplicit
+TestTransformExclusiveScanDispatchImplicit
+TestTransformIfBinary
+TestTransformIfBinaryDeviceSeq
+TestTransformIfBinaryDispatchExplicit
+TestTransformIfBinaryDispatchImplicit
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryDeviceSeq
+TestTransformIfUnaryDispatchExplicit
+TestTransformIfUnaryDispatchImplicit
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilDeviceSeq
+TestTransformIfUnaryNoStencilDispatchExplicit
+TestTransformIfUnaryNoStencilDispatchImplicit
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformInclusiveScanDispatchExplicit
+TestTransformInclusiveScanDispatchImplicit
+TestTransformIteratorDevice
+TestTransformIteratorHost
+TestTransformIteratorReduce
+TestTransformReduce
+TestTransformReduceCountingIteratorDevice
+TestTransformReduceCountingIteratorHost
+TestTransformReduceCudaStreams
+TestTransformReduceDeviceSeq
+TestTransformReduceDispatchExplicit
+TestTransformReduceDispatchImplicit
+TestTransformReduceFromConst
+TestTransformReduceSimpleDevice
+TestTransformReduceSimpleHost
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanCudaStreams
+TestTransformScanDeviceSeq
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIterator
+TestTransformUnaryCudaStreams
+TestTransformUnaryDeviceSeq
+TestTransformUnaryDispatchExplicit
+TestTransformUnaryDispatchImplicit
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestTrivialSequenceDevice
+TestTrivialSequenceHost
+TestTupleComparison
+TestTupleConstructor
+TestTupleGet
+TestTupleReduce
+TestTupleScan
+TestTupleStableSort
+TestTupleSwap
+TestTupleTie
+TestTupleTransform
+TestTypeName
+TestUniformDecomposition
+TestUniformIntDistributionMax
+TestUniformIntDistributionMin
+TestUniformIntDistributionSaveRestore
+TestUniformRealDistributionMax
+TestUniformRealDistributionMin
+TestUniformRealDistributionSaveRestore
+TestUninitializedCopyCudaStreams
+TestUninitializedCopyDeviceSeq
+TestUninitializedCopyDispatchExplicit
+TestUninitializedCopyDispatchImplicit
+TestUninitializedCopyNCudaStreams
+TestUninitializedCopyNDeviceSeq
+TestUninitializedCopyNDispatchExplicit
+TestUninitializedCopyNDispatchImplicit
+TestUninitializedCopyNNonPODDevice
+TestUninitializedCopyNNonPODHost
+TestUninitializedCopyNSimplePODDevice
+TestUninitializedCopyNSimplePODHost
+TestUninitializedCopyNonPODDevice
+TestUninitializedCopyNonPODHost
+TestUninitializedCopySimplePODDevice
+TestUninitializedCopySimplePODHost
+TestUninitializedFillCudaStreams
+TestUninitializedFillDeviceSeq
+TestUninitializedFillDispatchExplicit
+TestUninitializedFillDispatchImplicit
+TestUninitializedFillNCudaStreams
+TestUninitializedFillNDeviceSeq
+TestUninitializedFillNDispatchExplicit
+TestUninitializedFillNDispatchImplicit
+TestUninitializedFillNNonPOD
+TestUninitializedFillNPODDevice
+TestUninitializedFillNPODHost
+TestUninitializedFillNonPOD
+TestUninitializedFillPODDevice
+TestUninitializedFillPODHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeyCopyDispatchExplicit
+TestUniqueByKeyCopyDispatchImplicit
+TestUniqueByKeyCudaStreams
+TestUniqueByKeyDeviceSeq
+TestUniqueByKeyDispatchExplicit
+TestUniqueByKeyDispatchImplicit
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeyCudaStreams
+TestUniqueCopyByKeyDeviceSeq
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopyCudaStreams
+TestUniqueCopyDeviceSeq
+TestUniqueCopyDispatchExplicit
+TestUniqueCopyDispatchImplicit
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueCudaStreams
+TestUniqueDeviceSeq
+TestUniqueDispatchExplicit
+TestUniqueDispatchImplicit
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorAssignFromBiDirectionalIteratorDevice
+TestVectorAssignFromBiDirectionalIteratorHost
+TestVectorAssignFromDeviceVectorDevice
+TestVectorAssignFromDeviceVectorHost
+TestVectorAssignFromHostVectorDevice
+TestVectorAssignFromHostVectorHost
+TestVectorAssignFromSTLVectorDevice
+TestVectorAssignFromSTLVectorHost
+TestVectorBinarySearch
+TestVectorBinarySearchDescending
+TestVectorBinarySearchDescendingSimpleDevice
+TestVectorBinarySearchDescendingSimpleHost
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchDispatchExplicit
+TestVectorBinarySearchDispatchImplicit
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorBool
+TestVectorContainingLargeType
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorDataDevice
+TestVectorDataHost
+TestVectorElementAssignmentDevice
+TestVectorElementAssignmentHost
+TestVectorEquality
+TestVectorErasePositionDevice
+TestVectorErasePositionHost
+TestVectorEraseRangeDevice
+TestVectorEraseRangeHost
+TestVectorFillAssignDevice
+TestVectorFillAssignHost
+TestVectorFillInsert
+TestVectorFillInsertSimple<thrust::device_vector>
+TestVectorFillInsertSimple<thrust::host_vector>
+TestVectorFromBiDirectionalIteratorDevice
+TestVectorFromBiDirectionalIteratorHost
+TestVectorFromSTLVectorDevice
+TestVectorFromSTLVectorHost
+TestVectorFrontBackDevice
+TestVectorFrontBackHost
+TestVectorInequality
+TestVectorLowerBound
+TestVectorLowerBoundDescending
+TestVectorLowerBoundDescendingSimpleDevice
+TestVectorLowerBoundDescendingSimpleHost
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundDispatchExplicit
+TestVectorLowerBoundDispatchImplicit
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorManipulationDevice
+TestVectorManipulationHost
+TestVectorRangeInsert
+TestVectorRangeInsertSimple<thrust::device_vector>
+TestVectorRangeInsertSimple<thrust::host_vector>
+TestVectorReservingDevice
+TestVectorReservingHost
+TestVectorResizingDevice
+TestVectorResizingHost
+TestVectorReversedDevice
+TestVectorReversedHost
+TestVectorShrinkToFitDevice
+TestVectorShrinkToFitHost
+TestVectorSwapDevice
+TestVectorSwapHost
+TestVectorToAndFromDeviceVectorDevice
+TestVectorToAndFromDeviceVectorHost
+TestVectorToAndFromHostVectorDevice
+TestVectorToAndFromHostVectorHost
+TestVectorUpperBound
+TestVectorUpperBoundDescending
+TestVectorUpperBoundDescendingSimpleDevice
+TestVectorUpperBoundDescendingSimpleHost
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundDispatchExplicit
+TestVectorUpperBoundDispatchImplicit
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
+TestVectorWithInitialValueDevice
+TestVectorWithInitialValueHost
+TestVectorZeroSizeDevice
+TestVectorZeroSizeHost
+TestZipIteratorCopyAoSToSoA
+TestZipIteratorCopyDevice
+TestZipIteratorCopyHost
+TestZipIteratorCopySoAToAoS
+TestZipIteratorManipulation
+TestZipIteratorReduce
+TestZipIteratorReduceByKey
+TestZipIteratorReference
+TestZipIteratorScan
+TestZipIteratorStableSort
+TestZipIteratorStableSortByKey
+TestZipIteratorSystem
+TestZipIteratorTransform
+TestZipIteratorTraversal
+TestZippedDiscardIterator
diff --git a/internal/test/unittest_omp.lst b/internal/test/unittest_omp.lst
new file mode 100644
index 000000000..f59230e89
--- /dev/null
+++ b/internal/test/unittest_omp.lst
@@ -0,0 +1,808 @@
+TestAdjacentDifference
+TestAdjacentDifferenceDiscardIterator
+TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes
+TestAdjacentDifferenceSimpleDevice
+TestAdjacentDifferenceSimpleHost
+TestAdvanceDevice
+TestAdvanceHost
+TestAllOfDevice
+TestAllOfHost
+TestAnyOfDevice
+TestAnyOfHost
+TestAssertEqual
+TestAssertGEqual
+TestAssertLEqual
+TestBitAndFunctionalDevice
+TestBitAndFunctionalHost
+TestBitOrFunctionalDevice
+TestBitOrFunctionalHost
+TestBitXorFunctionalDevice
+TestBitXorFunctionalHost
+TestComputeCapability
+TestConstantIteratorComparison
+TestConstantIteratorConstructFromConvertibleSpace
+TestConstantIteratorCopyDevice
+TestConstantIteratorCopyHost
+TestConstantIteratorIncrement
+TestConstantIteratorReduce
+TestConstantIteratorTransformDevice
+TestConstantIteratorTransformHost
+TestCopyConstantIteratorToZipIteratorDevice
+TestCopyConstantIteratorToZipIteratorHost
+TestCopyCountingIteratorDevice
+TestCopyCountingIteratorHost
+TestCopyDeviceThrow
+TestCopyFromConstIterator
+TestCopyIf
+TestCopyIfSimpleDevice
+TestCopyIfSimpleHost
+TestCopyIfStencil
+TestCopyIfStencilSimpleDevice
+TestCopyIfStencilSimpleHost
+TestCopyListToDevice
+TestCopyListToHost
+TestCopyMatchingTypesDevice
+TestCopyMatchingTypesHost
+TestCopyMixedTypesDevice
+TestCopyMixedTypesHost
+TestCopyNConstantIteratorToZipIteratorDevice
+TestCopyNConstantIteratorToZipIteratorHost
+TestCopyNCountingIteratorDevice
+TestCopyNCountingIteratorHost
+TestCopyNFromConstIterator
+TestCopyNListToDevice
+TestCopyNListToHost
+TestCopyNMatchingTypesDevice
+TestCopyNMatchingTypesHost
+TestCopyNMixedTypesDevice
+TestCopyNMixedTypesHost
+TestCopyNToDiscardIterator
+TestCopyNVectorBool
+TestCopyNZipIteratorDevice
+TestCopyNZipIteratorHost
+TestCopyToDiscardIterator
+TestCopyToDiscardIteratorZipped
+TestCopyVectorBool
+TestCopyZipIteratorDevice
+TestCopyZipIteratorHost
+TestCount
+TestCountFromConstIteratorSimpleDevice
+TestCountFromConstIteratorSimpleHost
+TestCountIf
+TestCountIfSimpleDevice
+TestCountIfSimpleHost
+TestCountSimpleDevice
+TestCountSimpleHost
+TestCountingIteratorComparison
+TestCountingIteratorCopyConstructor
+TestCountingIteratorDifference
+TestCountingIteratorDistance
+TestCountingIteratorIncrement
+TestCountingIteratorLowerBound
+TestCountingIteratorUnsignedType
+TestDeviceDeleteDestructorInvocation
+TestDeviceDereferenceCountingIterator
+TestDeviceDereferenceDevicePtr
+TestDeviceDereferenceDeviceVectorIterator
+TestDeviceDereferenceTransformIterator
+TestDeviceDereferenceTransformedCountingIterator
+TestDevicePointerManipulation
+TestDeviceReferenceAssignmentFromDeviceReference
+TestDeviceReferenceConstructorFromDevicePointer
+TestDeviceReferenceConstructorFromDeviceReference
+TestDeviceReferenceManipulation
+TestDiscardIteratorComparison
+TestDiscardIteratorIncrement
+TestDistanceDevice
+TestDistanceHost
+TestDividesFunctionalDevice
+TestDividesFunctionalHost
+TestEqual
+TestEqualSimpleDevice
+TestEqualSimpleHost
+TestEqualToFunctionalDevice
+TestEqualToFunctionalHost
+TestExclusiveScan32
+TestExclusiveScanByKeySimpleDevice
+TestExclusiveScanByKeySimpleHost
+TestExclusiveScanNullPtr
+TestFill
+TestFillDiscardIterator
+TestFillMixedTypesDevice
+TestFillMixedTypesHost
+TestFillN
+TestFillNDiscardIterator
+TestFillNMixedTypesDevice
+TestFillNMixedTypesHost
+TestFillNSimpleDevice
+TestFillNSimpleHost
+TestFillSimpleDevice
+TestFillSimpleHost
+TestFillTuple
+TestFillWithNonTrivialAssignment
+TestFillWithTrivialAssignment
+TestFillZipIteratorDevice
+TestFillZipIteratorHost
+TestFind
+TestFindIf
+TestFindIfNot
+TestFindIfNotSimpleDevice
+TestFindIfNotSimpleHost
+TestFindIfSimpleDevice
+TestFindIfSimpleHost
+TestFindSimpleDevice
+TestFindSimpleHost
+TestForEach
+TestForEachLargeRegisterFootprint
+TestForEachSimpleAnySpace
+TestForEachSimpleDevice
+TestForEachSimpleHost
+TestForEachWithLargeTypes
+TestFunctionalPlaceholdersBinaryEqualToDevice
+TestFunctionalPlaceholdersBinaryEqualToHost
+TestFunctionalPlaceholdersBinaryGreaterDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualDevice
+TestFunctionalPlaceholdersBinaryGreaterEqualHost
+TestFunctionalPlaceholdersBinaryGreaterHost
+TestFunctionalPlaceholdersBinaryLessDevice
+TestFunctionalPlaceholdersBinaryLessEqualDevice
+TestFunctionalPlaceholdersBinaryLessEqualHost
+TestFunctionalPlaceholdersBinaryLessHost
+TestFunctionalPlaceholdersBinaryNotEqualToDevice
+TestFunctionalPlaceholdersBinaryNotEqualToHost
+TestFunctionalPlaceholdersBitAnd<thrust::device_vector>
+TestFunctionalPlaceholdersBitAnd<thrust::host_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitAndEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitNegateDevice
+TestFunctionalPlaceholdersBitNegateHost
+TestFunctionalPlaceholdersBitOr<thrust::device_vector>
+TestFunctionalPlaceholdersBitOr<thrust::host_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitOrEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitRshiftEqual<thrust::host_vector>
+TestFunctionalPlaceholdersBitXor<thrust::device_vector>
+TestFunctionalPlaceholdersBitXor<thrust::host_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::device_vector>
+TestFunctionalPlaceholdersBitXorEqual<thrust::host_vector>
+TestFunctionalPlaceholdersDivides<thrust::device_vector>
+TestFunctionalPlaceholdersDivides<thrust::host_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersDividesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersLogicalAndDevice
+TestFunctionalPlaceholdersLogicalAndHost
+TestFunctionalPlaceholdersLogicalNotDevice
+TestFunctionalPlaceholdersLogicalNotHost
+TestFunctionalPlaceholdersLogicalOrDevice
+TestFunctionalPlaceholdersLogicalOrHost
+TestFunctionalPlaceholdersMinus<thrust::device_vector>
+TestFunctionalPlaceholdersMinus<thrust::host_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMinusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersModulus<thrust::device_vector>
+TestFunctionalPlaceholdersModulus<thrust::host_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersModulusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::device_vector>
+TestFunctionalPlaceholdersMultiplies<thrust::host_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::device_vector>
+TestFunctionalPlaceholdersMultipliesEqual<thrust::host_vector>
+TestFunctionalPlaceholdersNegateDevice
+TestFunctionalPlaceholdersNegateHost
+TestFunctionalPlaceholdersPlus<thrust::device_vector>
+TestFunctionalPlaceholdersPlus<thrust::host_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::device_vector>
+TestFunctionalPlaceholdersPlusEqual<thrust::host_vector>
+TestFunctionalPlaceholdersPrefixDecrementDevice
+TestFunctionalPlaceholdersPrefixDecrementHost
+TestFunctionalPlaceholdersPrefixIncrementDevice
+TestFunctionalPlaceholdersPrefixIncrementHost
+TestFunctionalPlaceholdersSuffixDecrementDevice
+TestFunctionalPlaceholdersSuffixDecrementHost
+TestFunctionalPlaceholdersSuffixIncrementDevice
+TestFunctionalPlaceholdersSuffixIncrementHost
+TestFunctionalPlaceholdersTransformIterator<thrust::device_vector>
+TestFunctionalPlaceholdersTransformIterator<thrust::host_vector>
+TestFunctionalPlaceholdersUnaryPlusDevice
+TestFunctionalPlaceholdersUnaryPlusHost
+TestFunctionalPlaceholdersValue<thrust::device_vector>
+TestFunctionalPlaceholdersValue<thrust::host_vector>
+TestGather
+TestGatherCountingIteratorDevice
+TestGatherCountingIteratorHost
+TestGatherIf
+TestGatherIfSimpleDevice
+TestGatherIfSimpleHost
+TestGatherIfToDiscardIterator
+TestGatherSimpleDevice
+TestGatherSimpleHost
+TestGatherToDiscardIterator
+TestGenerate
+TestGenerateNSimpleDevice
+TestGenerateNSimpleHost
+TestGenerateNToDiscardIterator
+TestGenerateSimpleDevice
+TestGenerateSimpleHost
+TestGenerateToDiscardIterator
+TestGenerateTuple
+TestGenerateZipIteratorDevice
+TestGenerateZipIteratorHost
+TestGreaterEqualFunctionalDevice
+TestGreaterEqualFunctionalHost
+TestGreaterFunctionalDevice
+TestGreaterFunctionalHost
+TestIdentityFunctionalDevice
+TestIdentityFunctionalHost
+TestInclusiveScan32
+TestInclusiveScanByKeySimpleDevice
+TestInclusiveScanByKeySimpleHost
+TestInclusiveScanByKeyTransformIteratorDevice
+TestInclusiveScanByKeyTransformIteratorHost
+TestInclusiveScanWithIndirectionDevice
+TestInclusiveScanWithIndirectionHost
+TestInnerProduct
+TestInnerProductSimpleDevice
+TestInnerProductSimpleHost
+TestInnerProductWithOperatorDevice
+TestInnerProductWithOperatorHost
+TestIsCommutative
+TestIsPartitionedDevice
+TestIsPartitionedHost
+TestIsPartitionedSimpleDevice
+TestIsPartitionedSimpleHost
+TestIsPlainOldData
+TestIsSortedDevice
+TestIsSortedHost
+TestIsSortedRepeatedElementsDevice
+TestIsSortedRepeatedElementsHost
+TestIsSortedSimpleDevice
+TestIsSortedSimpleHost
+TestIsSortedUntilDevice
+TestIsSortedUntilHost
+TestIsSortedUntilRepeatedElementsDevice
+TestIsSortedUntilRepeatedElementsHost
+TestIsSortedUntilSimpleDevice
+TestIsSortedUntilSimpleHost
+TestIsTrivialIterator
+TestLessEqualFunctionalDevice
+TestLessEqualFunctionalHost
+TestLessFunctionalDevice
+TestLessFunctionalHost
+TestLog2
+TestLogicalAndFunctionalDevice
+TestLogicalAndFunctionalHost
+TestLogicalNotFunctionalDevice
+TestLogicalNotFunctionalHost
+TestLogicalOrFunctionalDevice
+TestLogicalOrFunctionalHost
+TestMakeConstantIterator
+TestMakeDevicePointer
+TestMakeDiscardIterator
+TestMakePermutationIteratorDevice
+TestMakePermutationIteratorHost
+TestMakeTransformIteratorDevice
+TestMakeTransformIteratorHost
+TestMakeTuple
+TestMax
+TestMaxActiveBlocks
+TestMaxBlocksize
+TestMaxBlocksizeWithHighestOccupancy
+TestMaxElement
+TestMaxElementSimpleDevice
+TestMaxElementSimpleHost
+TestMaximumFunctionalDevice
+TestMaximumFunctionalHost
+TestMerge
+TestMergeDescending
+TestMergeKeyValue
+TestMergeKeyValueDescending
+TestMergeSimpleDevice
+TestMergeSimpleHost
+TestMergeSortAscendingKey
+TestMergeSortAscendingKeyValue
+TestMergeSortDescendingKey
+TestMergeSortDescendingKeyValue
+TestMergeSortKeySimple
+TestMergeSortKeyValueSimple
+TestMergeSortStableKeySimple
+TestMergeToDiscardIterator
+TestMin
+TestMinElement
+TestMinElementSimpleDevice
+TestMinElementSimpleHost
+TestMinMaxElement
+TestMinMaxElementSimpleDevice
+TestMinMaxElementSimpleHost
+TestMinimumFunctionalDevice
+TestMinimumFunctionalHost
+TestMinstdRand0Equal
+TestMinstdRand0Max
+TestMinstdRand0Min
+TestMinstdRand0SaveRestore
+TestMinstdRand0Unequal
+TestMinstdRand0Validation
+TestMinstdRandEqual
+TestMinstdRandMax
+TestMinstdRandMin
+TestMinstdRandSaveRestore
+TestMinstdRandUnequal
+TestMinstdRandValidation
+TestMinusFunctionalDevice
+TestMinusFunctionalHost
+TestMismatchSimpleDevice
+TestMismatchSimpleHost
+TestModulusFunctionalDevice
+TestModulusFunctionalHost
+TestMultipliesFunctionalDevice
+TestMultipliesFunctionalHost
+TestNegateFunctionalDevice
+TestNegateFunctionalHost
+TestNoneOfDevice
+TestNoneOfHost
+TestNot1Device
+TestNot1Host
+TestNot2Device
+TestNot2Host
+TestNotEqualToFunctionalDevice
+TestNotEqualToFunctionalHost
+TestNullPtrDereferenceYieldsError
+TestPairComparison
+TestPairGet
+TestPairManipulation
+TestPairReduce
+TestPairScan
+TestPairScanByKey
+TestPairStableSort
+TestPairStableSortByKey
+TestPairTransform
+TestPairTupleElement
+TestPairTupleSize
+TestPartition
+TestPartitionCopy
+TestPartitionCopySimpleDevice
+TestPartitionCopySimpleHost
+TestPartitionCopyToDiscardIterator
+TestPartitionPointDevice
+TestPartitionPointHost
+TestPartitionPointSimpleDevice
+TestPartitionPointSimpleHost
+TestPartitionSimpleDevice
+TestPartitionSimpleHost
+TestPartitionZipIteratorDevice
+TestPartitionZipIteratorHost
+TestPermutationIteratorGatherDevice
+TestPermutationIteratorGatherHost
+TestPermutationIteratorHostDeviceGather
+TestPermutationIteratorHostDeviceScatter
+TestPermutationIteratorReduceDevice
+TestPermutationIteratorReduceHost
+TestPermutationIteratorScatterDevice
+TestPermutationIteratorScatterHost
+TestPermutationIteratorSimpleDevice
+TestPermutationIteratorSimpleHost
+TestPermutationIteratorWithCountingIteratorDevice
+TestPermutationIteratorWithCountingIteratorHost
+TestPlusFunctionalDevice
+TestPlusFunctionalHost
+TestProject1stFunctionalDevice
+TestProject1stFunctionalHost
+TestProject2ndFunctionalDevice
+TestProject2ndFunctionalHost
+TestRadixSort
+TestRadixSortByKey
+TestRadixSortByKeyLongLongValues
+TestRadixSortByKeyShortValues
+TestRadixSortByKeyUnaligned
+TestRadixSortKeySimple<thrust::device_vector>
+TestRadixSortKeyValueSimple<thrust::device_vector>
+TestRanlux24BaseEqual
+TestRanlux24BaseMax
+TestRanlux24BaseMin
+TestRanlux24BaseSaveRestore
+TestRanlux24BaseUnequal
+TestRanlux24BaseValidation
+TestRanlux24Equal
+TestRanlux24Max
+TestRanlux24Min
+TestRanlux24SaveRestore
+TestRanlux24Unequal
+TestRanlux24Validation
+TestRanlux48BaseEqual
+TestRanlux48BaseMax
+TestRanlux48BaseMin
+TestRanlux48BaseSaveRestore
+TestRanlux48BaseUnequal
+TestRanlux48BaseValidation
+TestRanlux48Equal
+TestRanlux48Max
+TestRanlux48Min
+TestRanlux48SaveRestore
+TestRanlux48Unequal
+TestRanlux48Validation
+TestRawPointerCastDevice
+TestRawPointerCastHost
+TestReduce
+TestReduceByKey
+TestReduceByKeySimpleDevice
+TestReduceByKeySimpleHost
+TestReduceByKeyToDiscardIterator
+TestReduceIntervals
+TestReduceIntervalsSimpleDevice
+TestReduceIntervalsSimpleHost
+TestReduceMixedTypesDevice
+TestReduceMixedTypesHost
+TestReduceNullPtr
+TestReduceSimpleDevice
+TestReduceSimpleHost
+TestReduceWithIndirectionDevice
+TestReduceWithIndirectionHost
+TestReduceWithLargeTypes
+TestReduceWithOperator
+TestRemove
+TestRemoveCopy
+TestRemoveCopyIf
+TestRemoveCopyIfSimpleDevice
+TestRemoveCopyIfSimpleHost
+TestRemoveCopyIfStencil
+TestRemoveCopyIfStencilSimpleDevice
+TestRemoveCopyIfStencilSimpleHost
+TestRemoveCopyIfStencilToDiscardIterator
+TestRemoveCopyIfToDiscardIterator
+TestRemoveCopySimpleDevice
+TestRemoveCopySimpleHost
+TestRemoveCopyToDiscardIterator
+TestRemoveCopyToDiscardIteratorZipped
+TestRemoveIf
+TestRemoveIfSimpleDevice
+TestRemoveIfSimpleHost
+TestRemoveIfStencil
+TestRemoveIfStencilSimpleDevice
+TestRemoveIfStencilSimpleHost
+TestRemoveSimpleDevice
+TestRemoveSimpleHost
+TestReplace
+TestReplaceCopy
+TestReplaceCopyIf
+TestReplaceCopyIfSimpleDevice
+TestReplaceCopyIfSimpleHost
+TestReplaceCopyIfStencil
+TestReplaceCopyIfStencilSimpleDevice
+TestReplaceCopyIfStencilSimpleHost
+TestReplaceCopyIfStencilToDiscardIterator
+TestReplaceCopyIfToDiscardIterator
+TestReplaceCopySimpleDevice
+TestReplaceCopySimpleHost
+TestReplaceCopyToDiscardIterator
+TestReplaceIf
+TestReplaceIfSimpleDevice
+TestReplaceIfSimpleHost
+TestReplaceIfStencil
+TestReplaceIfStencilSimpleDevice
+TestReplaceIfStencilSimpleHost
+TestReplaceSimpleDevice
+TestReplaceSimpleHost
+TestReverse
+TestReverseCopy
+TestReverseCopySimpleDevice
+TestReverseCopySimpleHost
+TestReverseCopyToDiscardIterator
+TestReverseIteratorCopyConstructor
+TestReverseIteratorCopyDevice
+TestReverseIteratorCopyHost
+TestReverseIteratorExclusiveScan
+TestReverseIteratorExclusiveScanSimple
+TestReverseIteratorIncrement
+TestReverseSimpleDevice
+TestReverseSimpleHost
+TestScalarBinarySearchDescendingSimpleDevice
+TestScalarBinarySearchDescendingSimpleHost
+TestScalarBinarySearchSimpleDevice
+TestScalarBinarySearchSimpleHost
+TestScalarEqualRangeDescendingSimpleDevice
+TestScalarEqualRangeDescendingSimpleHost
+TestScalarEqualRangeSimpleDevice
+TestScalarEqualRangeSimpleHost
+TestScalarLowerBoundDescendingSimpleDevice
+TestScalarLowerBoundDescendingSimpleHost
+TestScalarLowerBoundSimpleDevice
+TestScalarLowerBoundSimpleHost
+TestScalarUpperBoundDescendingSimpleDevice
+TestScalarUpperBoundDescendingSimpleHost
+TestScalarUpperBoundSimpleDevice
+TestScalarUpperBoundSimpleHost
+TestScan
+TestScanByKeyHeadFlagsDevice
+TestScanByKeyHeadFlagsHost
+TestScanByKeyLargeInput
+TestScanByKeyMixedTypes
+TestScanByKeyReusedKeysDevice
+TestScanByKeyReusedKeysHost
+TestScanByKeyWithLargeTypes
+TestScanMixedTypes
+TestScanMixedTypesDevice
+TestScanMixedTypesHost
+TestScanSimpleDevice
+TestScanSimpleHost
+TestScanToDiscardIterator
+TestScanWithLargeTypes
+TestScanWithOperator
+TestScanWithOperatorToDiscardIterator
+TestScatter
+TestScatterCountingIteratorDevice
+TestScatterCountingIteratorHost
+TestScatterIf
+TestScatterIfCountingIteratorDevice
+TestScatterIfCountingIteratorHost
+TestScatterIfSimpleDevice
+TestScatterIfSimpleHost
+TestScatterIfToDiscardIterator
+TestScatterSimpleDevice
+TestScatterSimpleHost
+TestScatterToDiscardIterator
+TestSelect
+TestSelectKeyValue
+TestSelectSemantics
+TestSequence
+TestSequenceSimpleDevice
+TestSequenceSimpleHost
+TestSequenceToDiscardIterator
+TestSetDifference
+TestSetDifferenceDescending
+TestSetDifferenceDescendingSimpleDevice
+TestSetDifferenceDescendingSimpleHost
+TestSetDifferenceEquivalentRanges
+TestSetDifferenceKeyValue
+TestSetDifferenceMultiset
+TestSetDifferenceSimpleDevice
+TestSetDifferenceSimpleHost
+TestSetIntersection
+TestSetIntersectionDescending
+TestSetIntersectionDescendingSimpleDevice
+TestSetIntersectionDescendingSimpleHost
+TestSetIntersectionEquivalentRanges
+TestSetIntersectionKeyValue
+TestSetIntersectionMultiset
+TestSetIntersectionSimpleDevice
+TestSetIntersectionSimpleHost
+TestSetIntersectionToDiscardIterator
+TestSetSymmetricDifference
+TestSetSymmetricDifferenceDescending
+TestSetSymmetricDifferenceDescendingSimpleDevice
+TestSetSymmetricDifferenceDescendingSimpleHost
+TestSetSymmetricDifferenceEquivalentRanges
+TestSetSymmetricDifferenceKeyValue
+TestSetSymmetricDifferenceMultiset
+TestSetSymmetricDifferenceSimpleDevice
+TestSetSymmetricDifferenceSimpleHost
+TestSetUnion
+TestSetUnionDescending
+TestSetUnionKeyValue
+TestSetUnionKeyValueDescending
+TestSetUnionSimpleDevice
+TestSetUnionSimpleHost
+TestSetUnionToDiscardIterator
+TestSetUnionWithEquivalentElementsSimpleDevice
+TestSetUnionWithEquivalentElementsSimpleHost
+TestSortAscendingKey
+TestSortAscendingKeyValue
+TestSortByKeySimpleDevice
+TestSortByKeySimpleHost
+TestSortByKeyVariableBits
+TestSortDescendingKey
+TestSortDescendingKeyValue
+TestSortNullPtr
+TestSortSimpleDevice
+TestSortSimpleHost
+TestSortVariableBits
+TestStablePartition
+TestStablePartitionCopy
+TestStablePartitionCopySimpleDevice
+TestStablePartitionCopySimpleHost
+TestStablePartitionCopyToDiscardIterator
+TestStablePartitionSimpleDevice
+TestStablePartitionSimpleHost
+TestStablePartitionZipIteratorDevice
+TestStablePartitionZipIteratorHost
+TestStableSort
+TestStableSortByKey
+TestStableSortByKeySemantics
+TestStableSortByKeySimpleDevice
+TestStableSortByKeySimpleHost
+TestStableSortByKeyWithLargeKeys
+TestStableSortByKeyWithLargeKeysAndValues
+TestStableSortByKeyWithLargeValues
+TestStableSortSemantics
+TestStableSortSimpleDevice
+TestStableSortSimpleHost
+TestStableSortWithIndirectionDevice
+TestStableSortWithIndirectionHost
+TestStableSortWithLargeKeys
+TestStandardIntegerTypes
+TestSwapRanges
+TestSwapRangesSimpleDevice
+TestSwapRangesSimpleHost
+TestSwapRangesUserSwap
+TestTaus88Equal
+TestTaus88Max
+TestTaus88Min
+TestTaus88SaveRestore
+TestTaus88Unequal
+TestTaus88Validation
+TestTransformBinary
+TestTransformBinaryCountingIteratorDevice
+TestTransformBinaryCountingIteratorHost
+TestTransformBinarySimpleDevice
+TestTransformBinarySimpleHost
+TestTransformBinaryToDiscardIterator
+TestTransformIfBinary
+TestTransformIfBinarySimpleDevice
+TestTransformIfBinarySimpleHost
+TestTransformIfBinaryToDiscardIterator
+TestTransformIfUnary
+TestTransformIfUnaryNoStencil
+TestTransformIfUnaryNoStencilSimpleDevice
+TestTransformIfUnaryNoStencilSimpleHost
+TestTransformIfUnarySimpleDevice
+TestTransformIfUnarySimpleHost
+TestTransformIfUnaryToDiscardIterator
+TestTransformIteratorDevice
+TestTransformIteratorHost
+TestTransformIteratorReduce
+TestTransformNullPtr
+TestTransformReduce
+TestTransformReduceCountingIteratorDevice
+TestTransformReduceCountingIteratorHost
+TestTransformReduceFromConst
+TestTransformReduceSimpleDevice
+TestTransformReduceSimpleHost
+TestTransformScan
+TestTransformScanCountingIteratorDevice
+TestTransformScanCountingIteratorHost
+TestTransformScanSimpleDevice
+TestTransformScanSimpleHost
+TestTransformScanToDiscardIterator
+TestTransformUnary
+TestTransformUnaryCountingIteratorDevice
+TestTransformUnaryCountingIteratorHost
+TestTransformUnarySimpleDevice
+TestTransformUnarySimpleHost
+TestTransformUnaryToDiscardIterator
+TestTransformUnaryToDiscardIteratorZipped
+TestTransformWithIndirectionDevice
+TestTransformWithIndirectionHost
+TestTrivialSequenceDevice
+TestTrivialSequenceHost
+TestTupleComparison
+TestTupleConstructor
+TestTupleGet
+TestTupleReduce
+TestTupleScan
+TestTupleStableSort
+TestTupleTie
+TestTupleTransform
+TestTypeName
+TestUniformDecomposition
+TestUniformIntDistributionMax
+TestUniformIntDistributionMin
+TestUniformIntDistributionSaveRestore
+TestUniformRealDistributionMax
+TestUniformRealDistributionMin
+TestUniformRealDistributionSaveRestore
+TestUninitializedCopyNonPODDevice
+TestUninitializedCopyNonPODHost
+TestUninitializedCopySimplePODDevice
+TestUninitializedCopySimplePODHost
+TestUninitializedFillNNonPOD
+TestUninitializedFillNPODDevice
+TestUninitializedFillNPODHost
+TestUninitializedFillNonPOD
+TestUninitializedFillPODDevice
+TestUninitializedFillPODHost
+TestUnique
+TestUniqueByKey
+TestUniqueByKeySimpleDevice
+TestUniqueByKeySimpleHost
+TestUniqueCopy
+TestUniqueCopyByKey
+TestUniqueCopyByKeySimpleDevice
+TestUniqueCopyByKeySimpleHost
+TestUniqueCopyByKeyToDiscardIterator
+TestUniqueCopySimpleDevice
+TestUniqueCopySimpleHost
+TestUniqueCopyToDiscardIterator
+TestUniqueSimpleDevice
+TestUniqueSimpleHost
+TestUnknownDeviceRobustness
+TestVectorAssignFromBiDirectionalIteratorDevice
+TestVectorAssignFromBiDirectionalIteratorHost
+TestVectorAssignFromDeviceVectorDevice
+TestVectorAssignFromDeviceVectorHost
+TestVectorAssignFromHostVectorDevice
+TestVectorAssignFromHostVectorHost
+TestVectorAssignFromSTLVectorDevice
+TestVectorAssignFromSTLVectorHost
+TestVectorBinarySearch
+TestVectorBinarySearchDescending
+TestVectorBinarySearchDescendingSimpleDevice
+TestVectorBinarySearchDescendingSimpleHost
+TestVectorBinarySearchDiscardIterator
+TestVectorBinarySearchSimpleDevice
+TestVectorBinarySearchSimpleHost
+TestVectorBool
+TestVectorContainingLargeType
+TestVectorCppZeroSizeDevice
+TestVectorCppZeroSizeHost
+TestVectorDataDevice
+TestVectorDataHost
+TestVectorElementAssignmentDevice
+TestVectorElementAssignmentHost
+TestVectorEquality
+TestVectorErasePositionDevice
+TestVectorErasePositionHost
+TestVectorEraseRangeDevice
+TestVectorEraseRangeHost
+TestVectorFillAssignDevice
+TestVectorFillAssignHost
+TestVectorFillInsert
+TestVectorFillInsertSimple<thrust::device_vector>
+TestVectorFillInsertSimple<thrust::host_vector>
+TestVectorFromBiDirectionalIteratorDevice
+TestVectorFromBiDirectionalIteratorHost
+TestVectorFromSTLVectorDevice
+TestVectorFromSTLVectorHost
+TestVectorFrontBackDevice
+TestVectorFrontBackHost
+TestVectorInequality
+TestVectorLowerBound
+TestVectorLowerBoundDescending
+TestVectorLowerBoundDescendingSimpleDevice
+TestVectorLowerBoundDescendingSimpleHost
+TestVectorLowerBoundDiscardIterator
+TestVectorLowerBoundSimpleDevice
+TestVectorLowerBoundSimpleHost
+TestVectorManipulationDevice
+TestVectorManipulationHost
+TestVectorRangeInsert
+TestVectorRangeInsertSimple<thrust::device_vector>
+TestVectorRangeInsertSimple<thrust::host_vector>
+TestVectorReservingDevice
+TestVectorReservingHost
+TestVectorResizingDevice
+TestVectorResizingHost
+TestVectorReversedDevice
+TestVectorReversedHost
+TestVectorShrinkToFitDevice
+TestVectorShrinkToFitHost
+TestVectorSwapDevice
+TestVectorSwapHost
+TestVectorToAndFromDeviceVectorDevice
+TestVectorToAndFromDeviceVectorHost
+TestVectorToAndFromHostVectorDevice
+TestVectorToAndFromHostVectorHost
+TestVectorUpperBound
+TestVectorUpperBoundDescending
+TestVectorUpperBoundDescendingSimpleDevice
+TestVectorUpperBoundDescendingSimpleHost
+TestVectorUpperBoundDiscardIterator
+TestVectorUpperBoundSimpleDevice
+TestVectorUpperBoundSimpleHost
+TestVectorWithInitialValueDevice
+TestVectorWithInitialValueHost
+TestVectorZeroSizeDevice
+TestVectorZeroSizeHost
+TestZipIteratorCopyAoSToSoA
+TestZipIteratorCopyDevice
+TestZipIteratorCopyHost
+TestZipIteratorCopySoAToAoS
+TestZipIteratorManipulation
+TestZipIteratorReduce
+TestZipIteratorReduceByKey
+TestZipIteratorReference
+TestZipIteratorScan
+TestZipIteratorSpace
+TestZipIteratorStableSort
+TestZipIteratorStableSortByKey
+TestZipIteratorTransform
+TestZipIteratorTraversal
+TestZippedDiscardIterator
diff --git a/internal/test/warningstester.cu b/internal/test/warningstester.cu
new file mode 100644
index 000000000..77c2947ac
--- /dev/null
+++ b/internal/test/warningstester.cu
@@ -0,0 +1,8 @@
+//#include "cuda_runtime_api.h"
+#include "warningstester.h"
+
+int main()
+{
+  return 0;
+}
+
diff --git a/performance/SConscript b/performance/SConscript
deleted file mode 100644
index ed8db553a..000000000
--- a/performance/SConscript
+++ /dev/null
@@ -1,63 +0,0 @@
-import sys
-
-# enable python to find the module
-module_path = Dir('.').srcnode().abspath
-sys.path.append(module_path)
-from build.perftest import compile_test
-
-import os
-
-Import('env')
-my_env = env.Clone()
-
-def cu_build_function(source, target, env):
-  compile_test(str(source[0]), str(target[0]))
-
-# define a rule to build a .cu from a .test
-cu_builder = Builder(action = cu_build_function,
-                     suffix = '.cu',
-                     src_suffix = '.test')
-my_env.Append(BUILDERS = {'CUFile' : cu_builder})
-
-# define a rule to build a report from an executable
-xml_builder = Builder(action = os.path.join('"' + str(my_env.Dir('.')), '$SOURCE" > $TARGET'),
-                      suffix = '.xml',
-                      src_suffix = my_env['PROGSUFFIX'])
-my_env.Append(BUILDERS = {'XMLFile' : xml_builder})
-
-my_env.Append(CPPPATH = [Dir('.').srcnode(), Dir('#/testing')])
-
-cu_list = []
-program_list = []
-xml_list = []
-
-build_files = [os.path.join('build', f) for f in ['perftest.py', 'test_function_template.cxx']]
-
-# describe dependency graph:
-# xml -> program -> .cu -> .test
-for test in my_env.Glob('*.test'):
-  cu = my_env.CUFile(test)
-  my_env.Depends(cu, build_files)
-  cu_list.append(cu)
-
-  prog = my_env.Program(cu)
-  program_list.append(prog)
-
-  xml = my_env.XMLFile(prog)
-  xml_list.append(xml)
-
-# make aliases for groups of targets
-run_performance_tests_alias = my_env.Alias("run_performance_tests", xml_list)
-performance_tests_alias     = my_env.Alias("performance_tests", program_list)
-
-# when no build target is specified, by default we build the programs
-my_env.Default(performance_tests_alias)
-
-# output a help message
-my_env.Help("""
-Type: 'scons' to build all performance test programs.
-Type: 'scons run_performance_tests' to run all performance tests and output reports.
-Type: 'scons <test name>' to build a single performance test program of interest.
-Type: 'scons <test name>.xml' to run a single performance test of interest and output a report in an XML file.
-""")
-
diff --git a/performance/adjacent_difference.test b/performance/adjacent_difference.test
deleted file mode 100644
index 819a5562d..000000000
--- a/performance/adjacent_difference.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/adjacent_difference.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_integers<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_input = h_input;
-    
-    thrust::host_vector<$InputType>   h_output($InputSize);
-    thrust::device_vector<$InputType> d_output($InputSize);
-
-    thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
-    thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin());
-
-    ASSERT_EQUAL(h_output, d_output);
-    """
-
-TIME = \
-    """
-    thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(2*sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['int']
-InputSizes = [2**24]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/axpy.test b/performance/axpy.test
deleted file mode 100644
index 9534ae932..000000000
--- a/performance/axpy.test
+++ /dev/null
@@ -1,84 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/transform.h>
-    #include <thrust/functional.h>
-    
-    //#include <cublas.h>
-    
-    #include <cmath>
-
-    template <typename T>
-    struct axpy
-    {
-        T a;
-
-        axpy(T a) : a(a) {}
-
-        __host__ __device__
-        T operator()(T x, T y) const
-        {
-            return a * x + y;
-        }
-    };
-    
-    template <typename Vector>
-    void axpy_fast(const typename Vector::value_type a, const Vector& x, Vector& y)
-    {
-        typedef typename Vector::value_type T;
-        thrust::transform(x.begin(), x.end(), y.begin(), y.begin(), axpy<T>(a));
-    }
-    
-    template <typename Vector>
-    void axpy_slow(const typename Vector::value_type a, const Vector& x, Vector& y)
-    {
-        typedef typename Vector::value_type T;
-
-        // temp <- a
-        Vector temp(x.size(), a);
-   
-        // temp <- a * x
-        thrust::transform(x.begin(), x.end(), temp.begin(), temp.begin(), thrust::multiplies<float>());
-
-        // y <- a * x + y
-        thrust::transform(temp.begin(), temp.end(), y.begin(), y.begin(), thrust::plus<float>());
-    }
-    
-
-    """
-
-INITIALIZE = \
-    """
-    //cublasInit();
-
-    thrust::host_vector<$InputType>   h_x = unittest::random_samples<$InputType>($InputSize);
-    thrust::host_vector<$InputType>   h_y = unittest::random_samples<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_x = h_x;
-    thrust::device_vector<$InputType> d_y = h_y;
-
-    $InputType a = 2.0;
-
-    $Method(a, h_x, h_y);
-    $Method(a, d_x, d_y);
-
-    ASSERT_EQUAL(h_x, d_x);
-    ASSERT_EQUAL(h_y, d_y);
-    """
-
-TIME = \
-    """
-    $Method(a, d_x, d_y);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(2 * double($InputSize));
-    RECORD_BANDWIDTH(3* sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['float', 'double']
-InputSizes = [2**24]
-Methods    = ['axpy_fast', 'axpy_slow']
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes), ('Method', Methods)]
-
diff --git a/performance/binary_search.test b/performance/binary_search.test
deleted file mode 100644
index cd0a22993..000000000
--- a/performance/binary_search.test
+++ /dev/null
@@ -1,45 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    #include <thrust/binary_search.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    
-    thrust::sort(h_keys.begin(), h_keys.end());
-    thrust::sort(d_keys.begin(), d_keys.end());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-
-    thrust::host_vector<$KeyType>   h_search = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_search = h_search;
-    
-    thrust::host_vector<unsigned int>    h_output($InputSize);
-    thrust::device_vector<unsigned int>  d_output($InputSize);
-
-    thrust::binary_search(h_keys.begin(), h_keys.end(), h_search.begin(), h_search.end(), h_output.begin());
-    thrust::binary_search(d_keys.begin(), d_keys.end(), d_search.begin(), d_search.end(), d_output.begin());
-
-    ASSERT_EQUAL(d_output, h_output);
-    """
-
-TIME = \
-    """
-    thrust::binary_search(d_keys.begin(), d_keys.end(), d_search.begin(), d_search.end(), d_output.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    """
-
-
-KeyTypes   = ['int']
-InputSizes = [2**24]
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/build/__init__.py b/performance/build/__init__.py
deleted file mode 100644
index bd5c0d75a..000000000
--- a/performance/build/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from perftest import *
-from testsuite import *
-from report import *
diff --git a/performance/build/perftest.h b/performance/build/perftest.h
deleted file mode 100644
index 852e30a53..000000000
--- a/performance/build/perftest.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <unittest/unittest.h>
-#include <build/timer.h>
-#include <string>
-#include <algorithm>
-
-
-//#include <cuda_runtime.h>
-//#include <cuda.h>
-
-#define RECORD_RESULT(name, value, units)   { std::cout << "  <result  name=\"" << name << "\"  value=\"" << value  << "\"  units=\"" << units << "\"/>" << std::endl; }
-#define RECORD_TIME()                       RECORD_RESULT("Time", best_time, "seconds")
-#define RECORD_RATE(name, value, units)     RECORD_RESULT(name, (double(value)/best_time), units)
-#define RECORD_BANDWIDTH(bytes)             RECORD_RATE("Bandwidth", double(bytes) / 1e9, "GBytes/s")
-#define RECORD_THROUGHPUT(value)            RECORD_RATE("Throughput", double(value) / 1e9, "GOp/s")
-#define RECORD_SORTING_RATE(size)           RECORD_RATE("Sorting", double(size) / 1e6, "MKeys/s")
-#define RECORD_VARIABLE(name, value)        { std::cout << "  <variable  name=\"" << name << "\"  value=\"" << value << "\"/>" << std::endl; }
-#define RECORD_TEST_STATUS(result, message) { std::cout << "  <status  result=\"" << result  << "\"  message=\"" << message << "\"/>" << std::endl; }
-#define RECORD_TEST_SUCCESS()               RECORD_TEST_STATUS("Success",  "")
-#define RECORD_TEST_FAILURE(message)        RECORD_TEST_STATUS("Failure",  message)
-#define BEGIN_TEST(name)                    { std::cout << "<test name=\"" << name << "\">" << std::endl; }
-#define END_TEST()                          { std::cout << "</test>" << std::endl; }
-#define BEGIN_TESTSUITE(name)               { std::cout << "<?xml version=\"1.0\" ?>" << std::endl << "<testsuite  name=\"" << name << "\">" << std::endl; }
-#define END_TESTSUITE()                     { std::cout << "</testsuite>" << std::endl; }
-
-
-#if defined(__GNUC__)  // GCC
-#define __HOST_COMPILER_NAME__ "GCC"
-# if defined(__GNUC_PATCHLEVEL__)
-#define __HOST_COMPILER_VERSION__ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-# else
-#define __HOST_COMPILER_VERSION__ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100)
-# endif
-#elif defined(_MSC_VER) // Microsoft Visual C++
-#define __HOST_COMPILER_NAME__ "MSVC"
-#define __HOST_COMPILER_VERSION__  _MSC_VER
-#elif defined(__INTEL_COMPILER) // Intel Compiler
-#define __HOST_COMPILER_NAME__ "ICC"
-#define __HOST_COMPILER_VERSION__  __INTEL_COMPILER 
-#else // Unknown
-#define __HOST_COMPILER_NAME__ "UNKNOWN"
-#define __HOST_COMPILER_VERSION__ 0
-#endif
-
-
-inline void RECORD_PLATFORM_INFO(void)
-{
-#if THRUST_DEVICE_SYSTEM==THRUST_DEVICE_SYSTEM_CUDA
-    int deviceCount;
-    cudaGetDeviceCount(&deviceCount);
-    if (deviceCount == 0){
-        std::cerr << "There is no device supporting CUDA" << std::endl;
-        exit(1);
-    }
-
-    int dev;
-    cudaGetDevice(&dev);
-    cudaDeviceProp deviceProp;
-    cudaGetDeviceProperties(&deviceProp, dev);
-
-    if (dev == 0 && deviceProp.major == 9999 && deviceProp.minor == 9999){
-        std::cerr << "There is no device supporting CUDA" << std::endl;
-        exit(1);
-    }
-
-    std::cout << "<platform>" << std::endl;
-    std::cout << "  <device name=\"" << deviceProp.name << "\">" << std::endl;
-    std::cout << "    <property name=\"revision\"" << " " << "value=\"" << deviceProp.major << "." << deviceProp.minor << "\"/>" << std::endl;
-    std::cout << "    <property name=\"global memory\"" << " " << "value=\"" << deviceProp.totalGlobalMem << "\"  units=\"bytes\"/>" << std::endl;
-    std::cout << "    <property name=\"multiprocessors\"" << " " << "value=\"" << deviceProp.multiProcessorCount << "\"/>" << std::endl;
-    std::cout << "    <property name=\"cores\"" << " " << "value=\"" << 8*deviceProp.multiProcessorCount << "\"/>" << std::endl;
-    std::cout << "    <property name=\"constant memory\"" << " " << "value=\"" << deviceProp.totalConstMem << "\"  units=\"bytes\"/>" << std::endl;
-    std::cout << "    <property name=\"shared memory per block\"" << " " << "value=\"" << deviceProp.sharedMemPerBlock << "\"  units=\"bytes\"/>" << std::endl;
-    std::cout << "    <property name=\"warp size\"" << " " << "value=\"" << deviceProp.warpSize << "\"/>" << std::endl;
-    std::cout << "    <property name=\"max threads per block\"" << " " << "value=\"" << deviceProp.maxThreadsPerBlock << "\"/>" << std::endl;
-    std::cout << "    <property name=\"clock rate\"" << " " << "value=\"" << (deviceProp.clockRate * 1e-6f) << "\"  units=\"GHz\"/>" << std::endl;
-    std::cout << "  </device>" << std::endl;
-    std::cout << "  <compilation>" << std::endl;
-    std::cout << "    <property name=\"CUDA_VERSION\" value=\"" << CUDA_VERSION << "\"/>" << std::endl;
-    std::cout << "    <property name=\"host compiler\" value=\"" << __HOST_COMPILER_NAME__ << " " << __HOST_COMPILER_VERSION__ << "\"/>" << std::endl;
-    std::cout << "    <property name=\"__DATE__\" value=\"" << __DATE__ << "\"/>" << std::endl;
-    std::cout << "    <property name=\"__TIME__\" value=\"" << __TIME__ << "\"/>" << std::endl;
-    std::cout << "  </compilation>" << std::endl;
-    std::cout << "</platform>" << std::endl;
-#endif
-}
-
-
-inline void PROCESS_ARGUMENTS(int argc, char **argv)
-{
-  for(int i = 1; i < argc; ++i)
-  {
-    if(std::string(argv[i]) == "--device")
-    {
-      ++i;
-      if(i == argc)
-      {
-        std::cerr << "usage: --device n" << std::endl;
-        exit(-1);
-      }
-
-#if THRUST_DEVICE_SYSTEM==THRUST_DEVICE_SYSTEM_CUDA
-      int device_index = atoi(argv[i]);
-      cudaSetDevice(device_index);
-#endif
-    }
-  }
-}
-
-
diff --git a/performance/build/perftest.py b/performance/build/perftest.py
deleted file mode 100644
index b7dfe3b32..000000000
--- a/performance/build/perftest.py
+++ /dev/null
@@ -1,156 +0,0 @@
-def product(*iterables):
-    """compute the cartesian product of a list of iterables
-    >>> for i in product(['a','b','c'],[1,2]):
-    ...     print i
-    ... 
-    ['a', 1]
-    ['a', 2]
-    ['b', 1]
-    ['b', 2]
-    ['c', 1]
-    ['c', 2]
-    """
-
-    if iterables:
-        for head in iterables[0]:
-            for remainder in product(*iterables[1:]):
-                yield [head] + remainder
-    else:
-        yield []
-
-
-####
-# Function generators
-def make_test_function_template(INITIALIZE, TIME, FINALIZE):
-    import string
-    import os
-
-    function_template_file = os.path.join( os.path.split(__file__)[0], 'test_function_template.cxx')
-
-    # test_function_template has locations for $PREAMBLE $INITIALIZE etc.
-    test_template = string.Template(open(function_template_file).read())
-
-    sections = {'INITIALIZE' : INITIALIZE,
-                'TIME' : TIME,
-                'FINALIZE' : FINALIZE}
-
-    # skeleton has supplied definitions for $INCLUDE and $PREAMBLE
-    # and has locations for $InputType and $InputSize etc.
-    skeleton = test_template.safe_substitute(sections)
-    
-    return string.Template(skeleton)
-
-def make_test_function(fname, TestVariablePairs, ftemplate):
-    VariableDescription = '\n'.join(['RECORD_VARIABLE("%s","%s");' % pair for pair in TestVariablePairs])
-
-    fmap = dict(TestVariablePairs)               
-    fmap['DESCRIPTION'] = VariableDescription
-    fmap['FUNCTION']    = fname
-            
-    return ftemplate.substitute(fmap)
-
-def generate_functions(pname, TestVariables, INITIALIZE, TIME, FINALIZE):
-    ftemplate = make_test_function_template(INITIALIZE, TIME, FINALIZE)
-
-    TestVariableNames  = [ pair[0] for pair in TestVariables]
-    TestVariableRanges = [ pair[1] for pair in TestVariables]
-
-    for n,values in enumerate(product(*TestVariableRanges)):
-        converted_values = []
-        for v in values:
-            v = str(v)
-            v = v.replace(" ","_")  # C++ tokens we don't want
-            v = v.replace(".","_")
-            v = v.replace("<","_")
-            v = v.replace(">","_")
-            v = v.replace(",","_")
-            v = v.replace(":","_")
-            converted_values.append(v)
-
-        fname = '_'.join( [pname] + converted_values )
-        TestVariablePairs = zip(TestVariableNames, values)
-        yield (fname, make_test_function(fname, TestVariablePairs, ftemplate))
-
-
-####
-# Program generators
-def make_test_program(pname, functions, PREAMBLE = ""):
-    parts = []
-    parts.append("#include <build/perftest.h>")
-
-    parts.append(PREAMBLE)
-
-    for fname,fcode in functions:
-        parts.append(fcode)
-
-    #TODO output TestVariables in <testsuite> somewhere
-
-    parts.append("int main(int argc, char **argv)")
-    parts.append("{")
-    parts.append("PROCESS_ARGUMENTS(argc, argv);")
-    parts.append("BEGIN_TESTSUITE(\"" + pname + "\");")
-    parts.append("RECORD_PLATFORM_INFO();")
-    for fname,fcode in functions:
-        parts.append(fname + "();")
-    parts.append("END_TESTSUITE();")
-    parts.append("}")
-    parts.append("\n")
-
-    return "\n".join(parts)
-
-def generate_program(pname, TestVariables, PREAMBLE, INITIALIZE, TIME, FINALIZE):
-    functions = list(generate_functions(pname, TestVariables, INITIALIZE, TIME, FINALIZE))
-    return make_test_program(pname, functions, PREAMBLE)
-
-
-###
-# Test Input File -> Test Program
-def process_test_file(filename):
-    import os
-    pname = os.path.splitext(os.path.split(filename)[1])[0]
-    
-    test_env_file = os.path.join( os.path.split(__file__)[0], 'test_env.py')
-
-    # XXX why does execfile() not give us the right namespace?
-    exec open(test_env_file)
-    exec open(filename)
-
-    return generate_program(pname, TestVariables, PREAMBLE, INITIALIZE, TIME, FINALIZE)
-
-
-def compile_test(input_name, output_name):
-    """Compiles a .test file into a .cu file"""
-    open(output_name, 'w').write( process_test_file(input_name) )
-
-
-
-##
-# Simple Driver script
-if __name__ == '__main__':
-    import os, sys
-
-    if len(sys.argv) not in [2,3]:
-        print "usage: %s test_input.py [test_output.cu]" % (sys.argv[0],)
-        os.exit()
-    
-    input_name = sys.argv[1]
-
-    if len(sys.argv) == 2:
-        # reduce.test -> reduce.cu
-        output_name = os.path.splitext(os.path.split(filename)[1])[0] + '.cu'
-    else:
-        output_name = sys.argv[2]
-        
-    # process_test_file returns a string containing 
-    # the whole test program (i.e. the text of a .cu file)
-    compile_test(input_name, output_name)
-
-    # this is just for show, scons integration would do this differently
-    #import subprocess
-    #subprocess.call('scons')
-    #subprocess.call('./' + pname)
-    #print "collecting data..."
-    #output = subprocess.Popen(['./' + pname], stdout=subprocess.PIPE).communicate()[0]
-    #print output
-
-
diff --git a/performance/build/report.py b/performance/build/report.py
deleted file mode 100644
index 531967be8..000000000
--- a/performance/build/report.py
+++ /dev/null
@@ -1,131 +0,0 @@
-from build import parse_testsuite_xml
-
-__all__ = ['plot_results','print_results']
-
-#TODO add print_results which outputs a CSV file
-
-def full_label(name):
-    known_labels = {'Throughput' : 'Throughput (GOp/s)',
-                    'Sorting'    : 'Sorting Rate (MKey/s)',
-                    'Bandwidth'  : 'Memory Bandwidth (GByte/s)',
-                    'InputSize'  : 'Input Size',
-                    'KeyType'    : 'Key Type' }
-
-    if name in known_labels:
-        return known_labels[name]
-    else:
-        return name
-
-def print_results(input_file, series_key, x_axis, y_axis, title=None, format=None, **kwargs):
-    """Plot performance data stored in an XML file
-
-    if format is None then the figure is shown, otherwise it is 
-    written to a file with the specified extension
-
-    Example
-    -------
-    input_file = 'reduce.xml'
-    series_key = 'InputType'
-    x_axis = 'InputSize'
-    y_axis = 'Throughput'
-    format = 'pdf'
-    """
-
-    try:
-        fid = open(input_file)
-    except IOError:
-        print "unable to open file '%s'" % input_file
-        return
-
-    TS = parse_testsuite_xml(fid)
-    
-    series_titles = set([test.variables[series_key] for (testname,test) in TS.tests.items()])
-    series = dict( zip(series_titles, [list() for s_title in series_titles]) )
-    
-    for testname,test in TS.tests.items():
-        if x_axis in test.variables and y_axis in test.results:
-            series[test.variables[series_key]].append( (test.variables[x_axis], test.results[y_axis]) )
-    
-    
-    print 'title,' + str(title)
-    print 'x_axis_label,' + full_label(x_axis)
-    print 'y_axis_label,' + full_label(y_axis)
-    
-    x_axis = set()
-    for series_title,series_data in series.items():
-        x_axis.update([t[0] for t in series_data])
-    x_axis = sorted(x_axis)
-        
-    print ','.join( ['x_axis'] + [str(v) for v in x_axis])
-
-    for series_title,series_data in series.items():
-        series_data = dict(series_data)
-
-        y_values = []
-        for x_value in x_axis:
-            if x_value in series_data:
-                y_values.append(str(series_data[x_value]))
-            else:
-                y_values.append('')
-
-        print ','.join( [series_title] + [str(v) for v in y_values])
-
-
-def plot_results(input_file, series_key, x_axis, y_axis, plot='loglog', dpi=72, title=None, format=None):
-    """Plot performance data stored in an XML file
-
-    if format is None then the figure is shown, otherwise it is 
-    written to a file with the specified extension
-
-    Example
-    -------
-    input_file = 'reduce.xml'
-    series_key = 'InputType'
-    x_axis = 'InputSize'
-    y_axis = 'Throughput'
-    format = 'pdf'
-    """
-
-    try:
-        fid = open(input_file)
-    except IOError:
-        print "unable to open file '%s'" % input_file
-        return
-
-    TS = parse_testsuite_xml(fid)
-    
-    series_titles = set([test.variables[series_key] for (testname,test) in TS.tests.items()])
-    series = dict( zip(series_titles, [list() for s_title in series_titles]) )
-    
-    for testname,test in TS.tests.items():
-        if x_axis in test.variables and y_axis in test.results:
-            series[test.variables[series_key]].append( (test.variables[x_axis], test.results[y_axis]) )
-    
-
-    if title is None:
-        title = TS.name
-
-    import pylab
-    
-    pylab.figure()
-    pylab.title(title)
-    pylab.xlabel(full_label(x_axis))
-    pylab.ylabel(full_label(y_axis))
-
-    plotter = getattr(pylab, plot) 
-    for series_title,series_data in series.items():
-        series_data.sort()
-        x_values = [val[0] for val in series_data]
-        y_values = [val[1] for val in series_data]
-   
-        plotter(x_values, y_values, label=series_title)
-
-    if len(series) >= 2:
-        pylab.legend(loc=0)
-   
-    if format is None:
-        pylab.show()    
-    else:
-        import os
-        fname = os.path.splitext(input_file)[0] + '.' + format
-        pylab.savefig(fname, dpi=dpi)
diff --git a/performance/build/test_env.py b/performance/build/test_env.py
deleted file mode 100644
index 6cba1ed93..000000000
--- a/performance/build/test_env.py
+++ /dev/null
@@ -1,13 +0,0 @@
-StandardTypes = ['char', 'unsigned char', 'short', 'unsigned short', 'int', 'unsigned int', 'long', 'unsigned long', 'float']
-SignedIntegerTypes = ['char', 'short', 'int', 'long']
-FloatingPointTypes = ['float','double']
-
-StandardSizes = [2**k for k in range(4, 24)]
-
-TestVariables = []
-
-PREAMBLE = ""
-INITIALIZE = ""
-TIME = ""
-FINALIZE = ""
-
diff --git a/performance/build/test_function_template.cxx b/performance/build/test_function_template.cxx
deleted file mode 100644
index d86668bfb..000000000
--- a/performance/build/test_function_template.cxx
+++ /dev/null
@@ -1,83 +0,0 @@
-void $FUNCTION(void)
-{
-    BEGIN_TEST(__FUNCTION__);
-
-    $DESCRIPTION
-
-    try {
-    /************ BEGIN INITIALIZATION SECTION ************/
-    $INITIALIZE
-    /************* END INITIALIZATION SECTION *************/
-    
-    
-        double warmup_time;
-        {
-          timer t;
-    /************ BEGIN TIMING SECTION ************/
-    $TIME
-    /************* END TIMING SECTION *************/
-          warmup_time = t.elapsed();
-        }
-    
-        // only verbose
-        //std::cout << "warmup_time: " << warmup_time << " seconds" << std::endl;
-    
-        static const size_t NUM_TRIALS = 5;
-        static const size_t MAX_ITERATIONS = 1000;
-        static const double MAX_TEST_TIME = 0.5;  //TODO allow to be set by user
-    
-        size_t NUM_ITERATIONS;
-        if (warmup_time == 0)
-            NUM_ITERATIONS = MAX_ITERATIONS;
-        else
-            NUM_ITERATIONS = std::min(MAX_ITERATIONS, std::max( (size_t) 1, (size_t) (MAX_TEST_TIME / warmup_time)));
-    
-        double trial_times[NUM_TRIALS];
-    
-        for(size_t trial = 0; trial < NUM_TRIALS; trial++)
-        {
-            timer t;
-            for(size_t i = 0; i < NUM_ITERATIONS; i++){
-                 
-    /************ BEGIN TIMING SECTION ************/
-    $TIME
-    /************* END TIMING SECTION *************/
-    
-            }
-    
-            trial_times[trial] = t.elapsed() / double(NUM_ITERATIONS);
-        }
-    
-        // only verbose
-        //for(size_t trial = 0; trial < NUM_TRIALS; trial++){
-        //    std::cout << "trial[" << trial << "]  : " << trial_times[trial] << " seconds\n";
-        //}
-    
-        double best_time = *std::min_element(trial_times, trial_times + NUM_TRIALS);
-    
-    /************ BEGIN FINALIZE SECTION ************/
-    $FINALIZE
-    /************* END FINALIZE SECTION *************/
-    
-#if THRUST_DEVICE_SYSTEM==THRUST_DEVICE_SYSTEM_CUDA
-        cudaError_t error = cudaGetLastError();
-        if(error){
-            RECORD_TEST_FAILURE(cudaGetErrorString(error));
-        } else {
-            RECORD_TEST_SUCCESS();
-        }
-#else
-        RECORD_TEST_SUCCESS();
-#endif
-
-    }  // end try
-    catch (std::bad_alloc) {
-        RECORD_TEST_FAILURE("std::bad_alloc");
-    }
-    catch (unittest::UnitTestException e) {
-        RECORD_TEST_FAILURE(e);
-    }
-
-
-    END_TEST();
-}
diff --git a/performance/build/test_program_template.cxx b/performance/build/test_program_template.cxx
deleted file mode 100644
index 3b256b768..000000000
--- a/performance/build/test_program_template.cxx
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <unittest/unittest.h>
-
-/*********** BEGIN PREAMBLE SECTION ***********/
-$PREAMBLE
-/************ END PREAMBLE SECTION ************/
-
-/*********** BEGIN FUNCTIONS SECTION ***********/
-$FUNCTIONS
-/************ END FUNCTIONS SECTION ************/
-
-int main(void)
-{
-//TODO process basic arguments
-
-/*********** BEGIN FUNCTIONCALLS SECTION ***********/
-$FUNCTIONCALLS
-/************ END FUNCTIONCALLS SECTION ************/
-
-}
diff --git a/performance/build/testsuite.py b/performance/build/testsuite.py
deleted file mode 100644
index 8710f1013..000000000
--- a/performance/build/testsuite.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""functions that generate reports and figures using the .xml output from the performance tests"""
-
-__all__ = ['TestSuite', 'parse_testsuite_xml']
-
-class TestSuite:
-    def __init__(self, name, platform, tests):
-        self.name = name
-        self.platform = platform
-        self.tests = tests
-
-    def __repr__(self):
-        import pprint
-        return 'TestSuite' + pprint.pformat( (self.name, self.platform, self.tests) ) 
-
-class Test:
-    def __init__(self, name, variables, results):
-        self.name = name
-        self.variables = variables
-        self.results = results
-
-    def __repr__(self):
-        return 'Test' + repr( (self.name, self.variables, self.results) )
-
-def scalar_element(element):
-    value = element.get('value')
-
-    try:
-        return int(value)
-    except:
-        try:
-            return float(value)
-        except:
-            return value
-
-def parse_testsuite_platform(et):
-    testsuite_platform = {}
-
-    platform_element = et.find('platform')
-    device_element = platform_element.find('device')
-
-    device = {}
-    device['name'] = device_element.get('name')
-    for property_element in device_element.findall('property'):
-        device[property_element.get('name')] = scalar_element(property_element)
-
-    testsuite_platform['device'] = device
-
-    return testsuite_platform
-
-def parse_testsuite_tests(et):
-    testsuite_tests = {}
-
-    for test_element in et.findall('test'):
-        # test name
-        test_name = test_element.get('name')
-
-        # test variables: name -> value
-        test_variables = {}
-        for variable_element in test_element.findall('variable'):
-            test_variables[variable_element.get('name')] = scalar_element(variable_element)
-
-        # test results: name -> (value, units)
-        test_results = {}
-        for result_element in test_element.findall('result'):
-            # TODO make this a thing that can be converted to its first element when treated like a number
-            test_results[result_element.get('name')] = scalar_element(result_element)
-        
-        testsuite_tests[test_name] = Test(test_name, test_variables, test_results)
-
-    return testsuite_tests
-
-def parse_testsuite_xml(filename):
-    import xml.etree.ElementTree as ET
-
-    et = ET.parse(filename)
-    
-    testsuite_name = et.getroot().get('name')
-    testsuite_platform = parse_testsuite_platform(et)
-    testsuite_tests = parse_testsuite_tests(et)
-    
-    return TestSuite(testsuite_name, testsuite_platform, testsuite_tests)
-
-
diff --git a/performance/build/timer.h b/performance/build/timer.h
deleted file mode 100644
index 7690ff765..000000000
--- a/performance/build/timer.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- *  Copyright 2008-2009 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-// A simple timer class
-
-#ifdef __CUDACC__
-
-// use CUDA's high-resolution timers when possible
-#include <cuda_runtime_api.h>
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-#include <string>
-
-void cuda_safe_call(cudaError_t error, const std::string& message = "")
-{
-  if(error)
-    throw thrust::system_error(error, thrust::cuda_category(), message);
-}
-
-struct timer
-{
-  cudaEvent_t start;
-  cudaEvent_t end;
-
-  timer(void)
-  {
-    cuda_safe_call(cudaEventCreate(&start));
-    cuda_safe_call(cudaEventCreate(&end));
-    restart();
-  }
-
-  ~timer(void)
-  {
-    cuda_safe_call(cudaEventDestroy(start));
-    cuda_safe_call(cudaEventDestroy(end));
-  }
-
-  void restart(void)
-  {
-    cuda_safe_call(cudaEventRecord(start, 0));
-  }
-
-  double elapsed(void)
-  {
-    cuda_safe_call(cudaEventRecord(end, 0));
-    cuda_safe_call(cudaEventSynchronize(end));
-
-    float ms_elapsed;
-    cuda_safe_call(cudaEventElapsedTime(&ms_elapsed, start, end));
-    return ms_elapsed / 1e3;
-  }
-
-  double epsilon(void)
-  {
-    return 0.5e-6;
-  }
-};
-
-#elif defined(__linux__)
-
-#include <sys/time.h>
-
-struct timer
-{
-  timeval start;
-  timeval end;
-
-  timer(void)
-  {
-    restart();
-  }
-
-  ~timer(void)
-  {
-  }
-
-  void restart(void)
-  {
-    gettimeofday(&start, NULL);
-  }
-
-  double elapsed(void)
-  {
-    gettimeofday(&end, NULL);
-
-    return static_cast<double>(end.tv_sec - start.tv_sec) + 1e-6 * static_cast<double>((int)end.tv_usec - (int)start.tv_usec);
-  }
-
-  double epsilon(void)
-  {
-    return 0.5e-6;
-  }
-};
-
-#else
-
-// fallback to clock()
-#include <ctime>
-
-struct timer
-{
-  clock_t start;
-  clock_t end;
-
-  timer(void)
-  {
-    restart();
-  }
-
-  ~timer(void)
-  {
-  }
-
-  void restart(void)
-  {
-    start = clock();
-  }
-
-  double elapsed(void)
-  {
-    end = clock();
-
-    return static_cast<double>(end - start) / static_cast<double>(CLOCKS_PER_SEC);
-  }
-
-  double epsilon(void)
-  {
-    return 1.0 / static_cast<double>(CLOCKS_PER_SEC);
-  }
-};
-
-#endif
-
diff --git a/performance/comparison_sort_by_key.test b/performance/comparison_sort_by_key.test
deleted file mode 100644
index 6c07f570a..000000000
--- a/performance/comparison_sort_by_key.test
+++ /dev/null
@@ -1,54 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    #include <thrust/sequence.h>
-
-    template<typename T>
-    struct my_less
-    {
-      __host__ __device__
-      bool operator()(const T &x, const T& y) const
-      {
-        return x < y;
-      }
-    };
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-
-    thrust::host_vector<$ValueType>   h_values($InputSize);
-    thrust::device_vector<$ValueType> d_values($InputSize);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-
-    // test sort
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), my_less<$KeyType>());
-
-    ASSERT_EQUAL(d_keys,   h_keys);
-    ASSERT_EQUAL(d_values, h_values);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), my_less<$KeyType>());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-KeyTypes = ['char', 'short', 'int', 'long long', 'float', 'double']
-ValueTypes = ['unsigned int']
-InputSizes = StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/copy_if.test b/performance/copy_if.test
deleted file mode 100644
index 86e54baf4..000000000
--- a/performance/copy_if.test
+++ /dev/null
@@ -1,50 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/copy.h>
-    #include <thrust/device_vector.h>
-    #include <thrust/host_vector.h>
-    #include <unittest/unittest.h>
-    #include <thrust/sequence.h>
-
-    struct pred
-    {
-      __host__ __device__
-      bool operator()(int x) { return bool(x); }
-    };
-
-    """
-
-INITIALIZE = \
-    """
-
-    thrust::host_vector<int> h_input($InputSize); thrust::sequence(h_input.begin(), h_input.end());
-    thrust::host_vector<int> h_stencil = unittest::random_integers<bool>($InputSize);
-    thrust::host_vector<int> h_output($InputSize, -1);
-
-    thrust::device_vector<int> d_input   = h_input;
-    thrust::device_vector<int> d_stencil = h_stencil;
-    thrust::device_vector<int> d_output  = h_output;
-
-    size_t h_count = thrust::copy_if(h_input.begin(), h_input.end(), h_stencil.begin(), h_output.begin(), pred()) - h_output.begin();
-    size_t d_count = thrust::copy_if(d_input.begin(), d_input.end(), d_stencil.begin(), d_output.begin(), pred()) - d_output.begin();
-
-    ASSERT_EQUAL(h_output, d_output);
-    ASSERT_EQUAL(h_count, d_count);
-    """
-
-TIME = \
-    """
-    thrust::copy_if(d_input.begin(), d_input.end(), d_stencil.begin(), d_output.begin(), pred());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH((2*sizeof(int) + 2*sizeof(float)) *  double($InputSize));
-    """
-
-InputSizes = [2**N for N in range(20, 27)]
-
-TestVariables = [('InputSize', InputSizes)]
-
diff --git a/performance/fill.test b/performance/fill.test
deleted file mode 100644
index bfac6dc5c..000000000
--- a/performance/fill.test
+++ /dev/null
@@ -1,33 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/fill.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input($InputSize);
-    thrust::device_vector<$InputType> d_input($InputSize);
-
-    thrust::fill(h_input.begin(),  h_input.end(),  $InputType(13));
-    thrust::fill(d_input.begin(),  d_input.end(),  $InputType(13));
-
-    ASSERT_EQUAL(h_input, d_input);
-    """
-
-TIME = \
-    """
-    thrust::fill(d_input.begin(),  d_input.end(),  $InputType(13));
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/fill_optimization.test b/performance/fill_optimization.test
deleted file mode 100644
index 3b03fad9e..000000000
--- a/performance/fill_optimization.test
+++ /dev/null
@@ -1,51 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/fill.h>
-    #include <thrust/generate.h>
-
-    template <typename T>
-    struct constant_functor
-    {
-        T x;
-
-        constant_functor(T x) : x(x) {}
-        __host__ __device__
-        T operator()(void) const {return x;}
-    };
-
-    template <typename Iterator, typename T>
-    void generate_fill(Iterator first, Iterator last, T x)
-    {
-        thrust::generate(first, last, constant_functor<T>(x));
-    }
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input($InputSize);
-    thrust::device_vector<$InputType> d_input($InputSize);
-
-    thrust::fill(h_input.begin(),  h_input.end(),  $InputType(13));
-    $Method(d_input.begin(),  d_input.end(),  $InputType(13));
-
-    ASSERT_EQUAL(h_input, d_input);
-    """
-
-TIME = \
-    """
-    $Method(d_input.begin(),  d_input.end(),  $InputType(13));
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['char', 'short', 'int', 'long']
-InputSizes = [2**24]
-Methods    = ['thrust::fill', 'generate_fill']
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes), ('Method', Methods)]
-
diff --git a/performance/find.test b/performance/find.test
deleted file mode 100644
index 16bac8da1..000000000
--- a/performance/find.test
+++ /dev/null
@@ -1,62 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/find.h>
-    #include <thrust/reduce.h>
-    #include <thrust/extrema.h>
-
-    template <typename Vector>
-    void find_partial(const Vector& v)
-    {
-        thrust::find(v.begin(),  v.end(), 1);
-    }
-    
-    template <typename Vector>
-    void find_full(const Vector& v)
-    {
-        thrust::max_element(v.begin(), v.end());
-    }
-    
-    template <typename Vector>
-    void reduce_full(const Vector& v)
-    {
-        thrust::max_element(v.begin(), v.end());
-    }
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input($InputSize, 0);
-    thrust::device_vector<$InputType> d_input($InputSize, 0);
-
-    size_t pos = $Fraction * $InputSize;
-
-    if (pos < $InputSize)
-    {
-        h_input[pos] = 1;
-        d_input[pos] = 1;
-    }
-
-    size_t h_index = thrust::find(h_input.begin(),  h_input.end(), 1) - h_input.begin();
-    size_t d_index = thrust::find(d_input.begin(),  d_input.end(), 1) - d_input.begin();
-
-    ASSERT_EQUAL(h_index, d_index);
-    """
-
-TIME = \
-    """
-    $Method(d_input);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['int']
-InputSizes = [2**23]
-Fractions  = [0.01, 0.99]
-Methods    = ['find_partial', 'find_full', 'reduce_full']
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes), ('Fraction', Fractions), ('Method', Methods)]
-
diff --git a/performance/float3_optimization.test b/performance/float3_optimization.test
deleted file mode 100644
index 2dd23ef64..000000000
--- a/performance/float3_optimization.test
+++ /dev/null
@@ -1,104 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/transform.h>
-    #include <thrust/iterator/zip_iterator.h>
-    
-    #include <cmath>
-
-    template <typename T>
-    struct rotate_tuple
-    {
-        template <typename Tuple>
-        __host__ __device__
-        Tuple operator()(const Tuple& t) const
-        {
-            T x = thrust::get<0>(t);
-            T y = thrust::get<1>(t);
-            T z = thrust::get<2>(t);
-
-            T rx = 0.36f*x +  0.48f*y + -0.80f*z;
-            T ry =-0.80f*x +  0.60f*y +  0.00f*z;
-            T rz = 0.48f*x +  0.64f*y +  0.60f*z;
-
-            return Tuple(rx, ry, rz);
-        }
-    };
-    
-    struct rotate_float3
-    {
-        __host__ __device__
-        float3 operator()(const float3& t) const
-        {
-            float x = t.x;
-            float y = t.y;
-            float z = t.z;
-
-            float3 rt;
-
-            rt.x = 0.36f*x +  0.48f*y + -0.80f*z;
-            rt.y =-0.80f*x +  0.60f*y +  0.00f*z;
-            rt.z = 0.48f*x +  0.64f*y +  0.60f*z;
-
-            return rt;
-        }
-    };
-    
-    template <typename Vector, typename Vector3>
-    void rotate_fast(Vector& x, Vector& y, Vector& z, Vector3& v)
-    {
-        typedef typename Vector::value_type T;
-
-        size_t N = x.size();
-        
-        thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin(), z.begin())),
-                          thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin(), z.begin())) + N,
-                          thrust::make_zip_iterator(thrust::make_tuple(x.begin(), y.begin(), z.begin())),
-                          rotate_tuple<T>());
-    }
-    
-    template <typename Vector, typename Vector3>
-    void rotate_slow(Vector& x, Vector& y, Vector& z, Vector3& v)
-    {
-        thrust::transform(v.begin(), v.end(), v.begin(), rotate_float3());
-    }
-
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_x = unittest::random_samples<$InputType>($InputSize);
-    thrust::host_vector<$InputType>   h_y = unittest::random_samples<$InputType>($InputSize);
-    thrust::host_vector<$InputType>   h_z = unittest::random_samples<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_x = h_x;
-    thrust::device_vector<$InputType> d_y = h_y;
-    thrust::device_vector<$InputType> d_z = h_z;
-    
-    thrust::host_vector<float3>   h_v($InputSize, make_float3(1.0,0.4,0.2));
-    thrust::device_vector<float3> d_v = h_v;
-
-    $Method(h_x, h_y, h_z, h_v);
-    $Method(d_x, d_y, d_z, d_v);
-
-    ASSERT_ALMOST_EQUAL(h_x, d_x);
-    ASSERT_ALMOST_EQUAL(h_y, d_y);
-    ASSERT_ALMOST_EQUAL(h_z, d_z);
-    """
-
-TIME = \
-    """
-    $Method(d_x, d_y, d_z, d_v);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(2*9*double($InputSize));
-    RECORD_BANDWIDTH(2*3*sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['float']
-InputSizes = [2**24]
-Methods    = ['rotate_fast','rotate_slow']
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes), ('Method', Methods)]
-
diff --git a/performance/gather.test b/performance/gather.test
deleted file mode 100644
index 9e47aa5d4..000000000
--- a/performance/gather.test
+++ /dev/null
@@ -1,43 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/gather.h>
-    #include <thrust/iterator/counting_iterator.h>
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<int>          h_map(thrust::make_counting_iterator(0),
-                                            thrust::make_counting_iterator($InputSize));
-    std::random_shuffle(h_map.begin(), h_map.end());
-    thrust::host_vector<$InputType>   h_result($InputSize);
-
-    thrust::device_vector<$InputType> d_input = h_input;
-    thrust::device_vector<int>        d_map = h_map;
-    thrust::device_vector<$InputType> d_result($InputSize);
-
-    thrust::gather(h_map.begin(), h_map.end(), h_input.begin(), h_result.begin());
-    thrust::gather(d_map.begin(), d_map.end(), d_input.begin(), d_result.begin());
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::gather(d_map.begin(), d_map.end(), d_input.begin(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
-
diff --git a/performance/host_sort.test b/performance/host_sort.test
deleted file mode 100644
index 9faf5f923..000000000
--- a/performance/host_sort.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType> h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::host_vector<$KeyType> h_keys_copy(h_keys);
-    
-    // test sort
-    $Sort(h_keys.begin(), h_keys.end());
-
-    ASSERT_EQUAL(thrust::is_sorted(h_keys.begin(), h_keys.end()), true);
-    """
-
-TIME = \
-    """
-    thrust::copy(h_keys_copy.begin(), h_keys_copy.end(), h_keys.begin());
-    $Sort(h_keys.begin(), h_keys.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes   = ['int']
-InputSizes = [2**20]
-Sorts      = ['thrust::sort', 'thrust::stable_sort', 'std::sort', 'std::stable_sort']
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes), ('Sort', Sorts)]
-
diff --git a/performance/host_sort_by_key.test b/performance/host_sort_by_key.test
deleted file mode 100644
index cdd4fd135..000000000
--- a/performance/host_sort_by_key.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType> h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::host_vector<$KeyType> h_keys_copy(h_keys);
-    thrust::host_vector<$KeyType> h_values($InputSize);
-    
-    // test sort
-    $Sort(h_keys.begin(), h_keys.end(), h_values.begin());
-
-    ASSERT_EQUAL(thrust::is_sorted(h_keys.begin(), h_keys.end()), true);
-    """
-
-TIME = \
-    """
-    thrust::copy(h_keys_copy.begin(), h_keys_copy.end(), h_keys.begin());
-    $Sort(h_keys.begin(), h_keys.end(), h_values.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes   = ['int']
-InputSizes = [2**20]
-Sorts      = ['thrust::sort_by_key', 'thrust::stable_sort_by_key']
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes), ('Sort', Sorts)]
-
diff --git a/performance/inclusive_scan.test b/performance/inclusive_scan.test
deleted file mode 100644
index c4d2c53f9..000000000
--- a/performance/inclusive_scan.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/scan.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_integers<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_input = h_input;
-    
-    thrust::host_vector<$InputType>   h_output($InputSize);
-    thrust::device_vector<$InputType> d_output($InputSize);
-
-    thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
-    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
-
-    ASSERT_EQUAL(h_output, d_output);
-    """
-
-TIME = \
-    """
-    thrust::inclusive_scan(d_input.begin(), d_input.end(), d_output.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(4*sizeof($InputType)*double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = [2**24] #StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/inclusive_scan_by_key.test b/performance/inclusive_scan_by_key.test
deleted file mode 100644
index 8843d5e0c..000000000
--- a/performance/inclusive_scan_by_key.test
+++ /dev/null
@@ -1,47 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/scan.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$ValueType>   h_values = unittest::random_integers<$ValueType>($InputSize);
-    thrust::device_vector<$ValueType> d_values = h_values;
-    
-    thrust::host_vector<$ValueType>   h_output($InputSize);
-    thrust::device_vector<$ValueType> d_output($InputSize);
-    
-    srand(13);
-    thrust::host_vector<$KeyType> h_keys($InputSize);
-    for(size_t i = 0, k = 0; i < $InputSize; i++)
-    {
-        h_keys[i] = k;
-        if (rand() % 50 == 0)
-            k++;
-    }
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin());
-                                                            
-    ASSERT_EQUAL(h_output, d_output);                       
-    """                                                     
-                                                            
-TIME = \
-    """                                                     
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_output.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(4*(sizeof($KeyType) + sizeof($ValueType))*double($InputSize));
-    """
-
-KeyTypes   = ['int'] #SignedIntegerTypes
-ValueTypes = SignedIntegerTypes
-InputSizes = [2**24] #StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/indirect_sort.test b/performance/indirect_sort.test
deleted file mode 100644
index e0fc508e3..000000000
--- a/performance/indirect_sort.test
+++ /dev/null
@@ -1,87 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-
-    template <typename RandomAccessIterator, typename StrictWeakOrdering> 
-    struct indirect_comp
-    {
-        RandomAccessIterator first;
-        StrictWeakOrdering   comp;
-    
-        indirect_comp(RandomAccessIterator first, StrictWeakOrdering comp)
-            : first(first), comp(comp) {}
-    
-        template <typename IndexType>
-        __host__ __device__
-        bool operator()(IndexType a, IndexType b)
-        {
-            return comp(thrust::raw_reference_cast(first[a]), thrust::raw_reference_cast(first[b]));
-        }    
-    };
-    
-    
-    template <typename RandomAccessIterator, typename StrictWeakOrdering>
-    void indirect_sort(RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       StrictWeakOrdering comp)
-    {
-        typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type T;
-        
-        // todo initialize vector in one step
-        thrust::device_vector<unsigned int> permutation(last - first);
-        thrust::sequence(permutation.begin(), permutation.end());  
-        
-        thrust::stable_sort(permutation.begin(), permutation.end(),
-                            indirect_comp<RandomAccessIterator,StrictWeakOrdering>(first, comp));
-    
-        thrust::device_vector<T> temp(first, last);
-    
-        thrust::gather(permutation.begin(), permutation.end(), temp.begin(), first);
-    }
-    """
-
-INITIALIZE = \
-    """
-    typedef FixedVector<int,$VectorLength> KeyType;
-
-    const size_t N = $InputSize / sizeof(KeyType);
-
-    thrust::host_vector<KeyType>   h_keys(N);
-        
-    for(size_t i = 0; i < h_keys.size(); i++)
-        h_keys[i] = KeyType(rand());
-    
-    thrust::device_vector<KeyType> d_keys      = h_keys;
-    thrust::device_vector<KeyType> d_keys_copy = d_keys;
-   
-    thrust::less<KeyType> comp;
-
-    // test sort
-    thrust::stable_sort(h_keys.begin(), h_keys.end());
-    $Sort(d_keys.begin(), d_keys.end(), comp);
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    $Sort(d_keys.begin(), d_keys.end(), comp);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-VectorLengths = [2**N for N in range(1,14)]
-Sorts         = ['indirect_sort']
-
-#VectorLengths = range(1,9)
-#Sorts         = ['indirect_sort', 'thrust::stable_sort']
-
-InputSizes    = [2**24]
-
-TestVariables = [('VectorLength', VectorLengths), ('Sort', Sorts), ('InputSize', InputSizes)]
-
diff --git a/performance/inner_product.test b/performance/inner_product.test
deleted file mode 100644
index e043ce60c..000000000
--- a/performance/inner_product.test
+++ /dev/null
@@ -1,37 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/inner_product.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input1 = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<$InputType>   h_input2 = unittest::random_integers<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_input1 = h_input1;
-    thrust::device_vector<$InputType> d_input2 = h_input2;
-
-    $InputType init = 13;
-
-    $InputType h_result = thrust::inner_product(h_input1.begin(), h_input1.end(), h_input2.begin(), init);
-    $InputType d_result = thrust::inner_product(d_input1.begin(), d_input1.end(), d_input2.begin(), init);
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::inner_product(d_input1.begin(), d_input1.end(), d_input2.begin(), init);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(2 * double($InputSize));
-    RECORD_BANDWIDTH(2 * sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
-
diff --git a/performance/merge.test b/performance/merge.test
deleted file mode 100644
index 1e158ec4e..000000000
--- a/performance/merge.test
+++ /dev/null
@@ -1,42 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/merge.h>
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::device_vector<$InputType> d_a = unittest::random_integers<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_b = unittest::random_integers<$InputType>($InputSize);
-    thrust::sort(d_a.begin(), d_a.end());
-    thrust::sort(d_b.begin(), d_b.end());
-
-    thrust::device_vector<$InputType> d_sorted;
-    d_sorted.insert(d_sorted.end(), d_a.begin(), d_a.end());
-    d_sorted.insert(d_sorted.end(), d_b.begin(), d_b.end());
-    thrust::stable_sort(d_sorted.begin(), d_sorted.end());
-
-    thrust::device_vector<$InputType> d_result(d_a.size() + d_b.size());
-    thrust::merge(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-
-    ASSERT_EQUAL(d_sorted, d_result);
-    """
-
-TIME = \
-    """
-    thrust::merge(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH(4 * sizeof($InputType) * double($InputSize));
-    RECORD_SORTING_RATE(2 * double($InputSize))
-    """
-
-
-InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/merge_sort.test b/performance/merge_sort.test
deleted file mode 100644
index b879f5ffb..000000000
--- a/performance/merge_sort.test
+++ /dev/null
@@ -1,46 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-
-    template<typename T>
-      struct my_less
-    {
-      __host__ __device__
-      bool operator()(const T &x, const T &y) const
-      {
-        return x < y;
-      }
-    };
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-    
-    // test sort
-    thrust::stable_sort(h_keys.begin(), h_keys.end());
-    thrust::stable_sort(d_keys.begin(), d_keys.end(), my_less<$KeyType>());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::stable_sort(d_keys.begin(), d_keys.end(), my_less<$KeyType>());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes   = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(18, 25)]
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/min_index.test b/performance/min_index.test
deleted file mode 100644
index 11dd32912..000000000
--- a/performance/min_index.test
+++ /dev/null
@@ -1,77 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/reduce.h>
-    #include <thrust/sequence.h>
-    #include <thrust/iterator/counting_iterator.h>
-    #include <thrust/iterator/zip_iterator.h>
-
-    using namespace thrust;
-
-    struct smaller_tuple
-    {
-      __host__ __device__
-      tuple<float,int> operator()(tuple<float,int> a, tuple<float,int> b)
-      {
-        if (a < b)
-          return a;
-        else
-          return b;
-      }
-    };
-    
-    int min_index_slow(device_vector<float>& values)
-    {
-      device_vector<int> indices(values.size());
-      sequence(indices.begin(), indices.end());
-
-      tuple<float,int> init(values[0],0);
-    
-      tuple<float,int> smallest = reduce(make_zip_iterator(make_tuple(values.begin(), indices.begin())),
-                                         make_zip_iterator(make_tuple(values.end(),   indices.end())),
-                                         init,
-                                         smaller_tuple());
-      return get<1>(smallest);
-    }
-    
-    int min_index_fast(device_vector<float>& values)
-    {
-      counting_iterator<int> begin(0);
-      counting_iterator<int> end(values.size());
-    
-      tuple<float,int> init(values[0],0);
-    
-      tuple<float,int> smallest = reduce(make_zip_iterator(make_tuple(values.begin(), begin)),
-                                         make_zip_iterator(make_tuple(values.end(),     end)),
-                                         init,
-                                         smaller_tuple());
-      return get<1>(smallest);
-    }
-
-
-
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<float>   h_input = unittest::random_integers<float>($InputSize);
-    thrust::device_vector<float> d_input = h_input;
-
-    """
-
-TIME = \
-    """
-    $Function(d_input);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof(float) *  double($InputSize));
-    """
-
-Functions  = ['min_index_slow','min_index_fast']
-InputSizes = [2**22]
-
-TestVariables = [('Function',Functions), ('InputSize', InputSizes)]
-
diff --git a/performance/nrm2.test b/performance/nrm2.test
deleted file mode 100644
index 5640d7934..000000000
--- a/performance/nrm2.test
+++ /dev/null
@@ -1,70 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/transform.h>
-    #include <thrust/reduce.h>
-    #include <thrust/transform_reduce.h>
-    #include <thrust/functional.h>
-    
-    #include <cmath>
-
-    template <typename T>
-    struct square
-    {
-        __host__ __device__
-        T operator()(T x) const
-        {
-            return x * x;
-        }
-    };
-    
-    template <typename Vector>
-    typename Vector::value_type nrm2_fast(const Vector& x)
-    {
-        typedef typename Vector::value_type T;
-        return std::sqrt( thrust::transform_reduce(x.begin(), x.end(), square<T>(), T(0), thrust::plus<T>()) );
-    }
-    
-    template <typename Vector>
-    typename Vector::value_type nrm2_slow(const Vector& x)
-    {
-        typedef typename Vector::value_type T;
-        
-        Vector temp(x.size());
-        
-        // temp <- x * x
-        thrust::transform(x.begin(), x.end(), temp.begin(), square<T>());
-
-        return std::sqrt( thrust::reduce(temp.begin(), temp.end()) );
-    }
-
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_integers<bool>($InputSize);
-    thrust::device_vector<$InputType> d_input = h_input;
-
-    $InputType h_result = $Method(h_input);
-    $InputType d_result = $Method(d_input);
-
-    ASSERT_EQUAL(std::abs(h_result - d_result) / std::abs(h_result + d_result) < 1e-3, true);
-    """
-
-TIME = \
-    """
-    $Method(d_input);
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['float', 'double']
-InputSizes = [2**24]
-Methods    = ['nrm2_fast', 'nrm2_slow']
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes), ('Method', Methods)]
-
diff --git a/performance/radix_sort.test b/performance/radix_sort.test
deleted file mode 100644
index 972707141..000000000
--- a/performance/radix_sort.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-    
-    // test sort
-    thrust::stable_sort(h_keys.begin(), h_keys.end());
-    thrust::stable_sort(d_keys.begin(), d_keys.end());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::stable_sort(d_keys.begin(), d_keys.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes   = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(18, 25)]
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/radix_sort_bits.test b/performance/radix_sort_bits.test
deleted file mode 100644
index 82b6e991a..000000000
--- a/performance/radix_sort_bits.test
+++ /dev/null
@@ -1,42 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    const size_t InputSize = 1 << 24;
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>(InputSize);
-
-    // set upper bits to zero
-    for(size_t i = 0; i < InputSize; i++)
-        h_keys[i] >>= (32 - $KeyBits);
-
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-    
-    // test sort
-    thrust::stable_sort(h_keys.begin(), h_keys.end());
-    thrust::stable_sort(d_keys.begin(), d_keys.end());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::stable_sort(d_keys.begin(), d_keys.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double(InputSize));
-    """
-
-
-KeyTypes = ['unsigned int']
-KeyBits = range(1, 33)
-
-TestVariables = [('KeyType', KeyTypes), ('KeyBits',KeyBits)]
-
diff --git a/performance/radix_sort_by_key.test b/performance/radix_sort_by_key.test
deleted file mode 100644
index ba8f8646d..000000000
--- a/performance/radix_sort_by_key.test
+++ /dev/null
@@ -1,44 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    #include <thrust/sequence.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-
-    thrust::host_vector<$ValueType>   h_values($InputSize);
-    thrust::device_vector<$ValueType> d_values($InputSize);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-
-    // test sort
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
-
-    ASSERT_EQUAL(d_keys,   h_keys);
-    ASSERT_EQUAL(d_values, h_values);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-KeyTypes = ['char', 'short', 'int', 'long long', 'float', 'double']
-ValueTypes = ['unsigned int']
-InputSizes = StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/reduce.test b/performance/reduce.test
deleted file mode 100644
index 6eea3b472..000000000
--- a/performance/reduce.test
+++ /dev/null
@@ -1,34 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/reduce.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_integers<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_input = h_input;
-
-    $InputType init = 13;
-
-    $InputType h_result = thrust::reduce(h_input.begin(), h_input.end(), init);
-    $InputType d_result = thrust::reduce(d_input.begin(), d_input.end(), init);
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::reduce(d_input.begin(), d_input.end(), init);   
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/reduce_by_key.test b/performance/reduce_by_key.test
deleted file mode 100644
index 10aee8091..000000000
--- a/performance/reduce_by_key.test
+++ /dev/null
@@ -1,61 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/reduce.h>
-    #include <thrust/random.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$ValueType>   h_values = unittest::random_integers<$ValueType>($InputSize);
-    thrust::device_vector<$ValueType> d_values = h_values;
-
-    thrust::host_vector<$KeyType>     h_keys_result($InputSize);
-    thrust::host_vector<$ValueType>   h_values_result($InputSize);
-
-    thrust::device_vector<$KeyType>   d_keys_result($InputSize);
-    thrust::device_vector<$ValueType> d_values_result($InputSize);
-
-    thrust::default_random_engine rng(13);
-    thrust::host_vector<$KeyType> h_keys($InputSize);
-    for(size_t i = 0, k = 0; i < $InputSize; i++)
-    {
-      h_keys[i] = k;
-      if(rng() % 50 == 0)
-        k++;
-    }
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-
-    thrust::pair<
-      thrust::host_vector<$KeyType>::iterator,
-      thrust::host_vector<$ValueType>::iterator
-    > h_end = thrust::reduce_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), h_keys_result.begin(), h_values_result.begin());
-    h_keys_result.erase(h_end.first, h_keys_result.end());
-
-    thrust::pair<
-      thrust::device_vector<$KeyType>::iterator,
-      thrust::device_vector<$ValueType>::iterator
-    > d_end = thrust::reduce_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_keys_result.begin(), d_values_result.begin());
-    d_keys_result.erase(d_end.first, d_keys_result.end());
-
-    ASSERT_EQUAL(h_keys_result, d_keys_result);
-    ASSERT_EQUAL(h_values_result, d_values_result);
-    """
-
-TIME = \
-    """
-    thrust::reduce_by_key(d_keys.begin(), d_keys.end(), d_values.begin(), d_keys_result.begin(), d_values_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($KeyType) * double(d_keys.size() + d_keys_result.size()) + sizeof($ValueType) * double(d_values.size() + d_values_result.size()));
-    """
-
-KeyTypes   = ['int'] #SignedIntegerTypes
-ValueTypes = SignedIntegerTypes
-InputSizes = [2**24] #StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes),('InputSize', InputSizes)]
-
diff --git a/performance/reduce_float.test b/performance/reduce_float.test
deleted file mode 100644
index 8dda319a3..000000000
--- a/performance/reduce_float.test
+++ /dev/null
@@ -1,31 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/reduce.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType>   h_input = unittest::random_samples<$InputType>($InputSize);
-    thrust::device_vector<$InputType> d_input = h_input;
-
-    $InputType init = 13;
-
-    """
-
-TIME = \
-    """
-    thrust::reduce(d_input.begin(), d_input.end(), init);   
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    RECORD_BANDWIDTH(sizeof($InputType) *  double($InputSize));
-    """
-
-InputTypes = ['float']
-InputSizes = [int(2**(k/2.0)) for k in range(42,56)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/report.py b/performance/report.py
deleted file mode 100644
index 6024ee33f..000000000
--- a/performance/report.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from build import plot_results, print_results
-
-#valid formats are png, pdf, ps, eps and svg
-#if format=None the plot will be displayed
-format = 'png'
-output = print_results
-#output = plot_results
-
-for function in ['fill', 'reduce', 'inner_product', 'gather', 'merge']:
-    output(function + '.xml', 'InputType', 'InputSize', 'Bandwidth', format=format)
-
-for function in ['inclusive_scan', 'inclusive_segmented_scan', 'unique']:
-    output(function + '.xml', 'InputType', 'InputSize', 'Throughput', format=format)
-
-for method in ['indirect_sort']:
-    output(method + '.xml',    'Sort', 'VectorLength', 'Time', plot='semilogx', title='Indirect Sorting', format=format)
-
-for method in ['sort', 'comparison_sort', 'radix_sort']:
-    output(method + '.xml',    'KeyType', 'InputSize', 'Sorting', title='thrust::' + method, format=format)
-    output(method + '_by_key.xml', 'KeyType', 'InputSize', 'Sorting', title='thrust::' + method + '_by_key', format=format)
-
-for method in ['set_difference', 'set_intersection', 'set_symmetric_difference', 'set_union']:
-  output(method + '.xml', 'InputType', 'InputSize', 'Sorting', title='thrust::' + method, format=format)
-    
-output('stl_sort.xml', 'KeyType', 'InputSize', 'Sorting', title='std::sort', format=format)
-
-for method in ['radix_sort']:
-    output(method + '_bits.xml', 'KeyType', 'KeyBits', 'Sorting', title='thrust::' + method, plot='plot', dpi=72, format=format)
-
-for format in ['png', 'pdf']:
-    output('reduce_float.xml', 'InputType', 'InputSize', 'Bandwidth', dpi=120, plot='semilogx', title='thrust::reduce<float>()', format=format)
-    output('sort_large.xml',  'KeyType', 'InputSize', 'Sorting', dpi=120, plot='semilogx', title='thrust::sort<T>()', format=format)
-
diff --git a/performance/set_difference.test b/performance/set_difference.test
deleted file mode 100644
index fa1521d8e..000000000
--- a/performance/set_difference.test
+++ /dev/null
@@ -1,45 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/set_operations.h>
-    #include <thrust/sort.h>
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
-    thrust::sort(h_a.begin(), h_a.end());
-    thrust::sort(h_b.begin(), h_b.end());
-
-    thrust::host_vector<$InputType> h_result(h_a.size());
-    thrust::host_vector<$InputType>::iterator new_end = 
-      thrust::set_difference(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
-    h_result.resize(new_end - h_result.begin());
-
-    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
-
-    thrust::device_vector<$InputType> d_result(h_result.size());
-    thrust::set_difference(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::set_difference(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
-    RECORD_SORTING_RATE(2 * double($InputSize))
-    """
-
-
-InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/set_intersection.test b/performance/set_intersection.test
deleted file mode 100644
index 2316fc36a..000000000
--- a/performance/set_intersection.test
+++ /dev/null
@@ -1,45 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/set_operations.h>
-    #include <thrust/sort.h>
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
-    thrust::sort(h_a.begin(), h_a.end());
-    thrust::sort(h_b.begin(), h_b.end());
-
-    thrust::host_vector<$InputType> h_result(h_a.size());
-    thrust::host_vector<$InputType>::iterator new_end = 
-      thrust::set_intersection(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
-    h_result.resize(new_end - h_result.begin());
-
-    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
-
-    thrust::device_vector<$InputType> d_result(h_result.size());
-    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::set_intersection(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
-    RECORD_SORTING_RATE(2 * double($InputSize))
-    """
-
-
-InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/set_symmetric_difference.test b/performance/set_symmetric_difference.test
deleted file mode 100644
index 2e08af416..000000000
--- a/performance/set_symmetric_difference.test
+++ /dev/null
@@ -1,45 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/set_operations.h>
-    #include <thrust/sort.h>
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
-    thrust::sort(h_a.begin(), h_a.end());
-    thrust::sort(h_b.begin(), h_b.end());
-
-    thrust::host_vector<$InputType> h_result(h_a.size());
-    thrust::host_vector<$InputType>::iterator new_end = 
-      thrust::set_symmetric_difference(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
-    h_result.resize(new_end - h_result.begin());
-
-    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
-
-    thrust::device_vector<$InputType> d_result(h_result.size());
-    thrust::set_symmetric_difference(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::set_symmetric_difference(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH((2 * double($InputSize) + d_result.size()) * sizeof($InputType));
-    RECORD_SORTING_RATE(2 * double($InputSize))
-    """
-
-
-InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/set_union.test b/performance/set_union.test
deleted file mode 100644
index 51a22b1ad..000000000
--- a/performance/set_union.test
+++ /dev/null
@@ -1,46 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/set_operations.h>
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType> h_a = unittest::random_integers<$InputType>($InputSize);
-    thrust::host_vector<$InputType> h_b = unittest::random_integers<$InputType>($InputSize);
-    thrust::sort(h_a.begin(), h_a.end());
-    thrust::sort(h_b.begin(), h_b.end());
-
-    thrust::host_vector<$InputType> h_result(h_a.size() + h_b.size());
-    thrust::host_vector<$InputType>::iterator h_new_end = 
-      thrust::set_union(h_a.begin(), h_a.end(), h_b.begin(), h_b.end(), h_result.begin());
-    h_result.resize(h_new_end - h_result.begin());
-
-    thrust::device_vector<$InputType> d_a = h_a, d_b = h_b;
-
-    thrust::device_vector<$InputType> d_result(d_a.size() + d_b.size());
-    thrust::device_vector<$InputType>::iterator d_new_end = 
-      thrust::set_union(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    d_result.resize(d_new_end - d_result.begin());
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::set_union(d_a.begin(), d_a.end(), d_b.begin(), d_b.end(), d_result.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_BANDWIDTH(sizeof($InputType) * double(d_a.size() + d_b.size() + d_result.size()));
-    RECORD_SORTING_RATE(2 * double($InputSize))
-    """
-
-
-InputTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/sort.test b/performance/sort.test
deleted file mode 100644
index bcbbfe447..000000000
--- a/performance/sort.test
+++ /dev/null
@@ -1,36 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-    
-    // test sort
-    thrust::sort(h_keys.begin(), h_keys.end());
-    thrust::sort(d_keys.begin(), d_keys.end());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::sort(d_keys.begin(), d_keys.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/sort_by_key.test b/performance/sort_by_key.test
deleted file mode 100644
index a132c5fc8..000000000
--- a/performance/sort_by_key.test
+++ /dev/null
@@ -1,44 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-    #include <thrust/sequence.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-
-    thrust::host_vector<$ValueType>   h_values($InputSize);
-    thrust::device_vector<$ValueType> d_values($InputSize);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-
-    // test sort
-    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
-
-    ASSERT_EQUAL(d_keys,   h_keys);
-    ASSERT_EQUAL(d_values, h_values);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::sort_by_key(d_keys.begin(), d_keys.end(), d_values.begin());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-KeyTypes = ['char', 'short', 'int', 'long long', 'float', 'double']
-ValueTypes = ['unsigned int']
-InputSizes = StandardSizes
-
-TestVariables = [('KeyType', KeyTypes), ('ValueType', ValueTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/sort_large.test b/performance/sort_large.test
deleted file mode 100644
index 4a36d3b71..000000000
--- a/performance/sort_large.test
+++ /dev/null
@@ -1,47 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/sort.h>
-
-    template <typename T>
-    struct my_less : public thrust::binary_function<T,T,bool>
-    {
-        __host__ __device__
-        bool operator()(const T& a, const T& b) const
-        {
-            return a < b;
-        }
-    };
-
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType>   h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::device_vector<$KeyType> d_keys = h_keys;
-    thrust::device_vector<$KeyType> d_keys_copy = d_keys;
-    
-    typedef my_less<$KeyType> Comp;
-    
-    // test sort
-    thrust::sort(h_keys.begin(), h_keys.end(), Comp());
-    thrust::sort(d_keys.begin(), d_keys.end(), Comp());
-
-    ASSERT_EQUAL(d_keys, h_keys);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_keys_copy.begin(), d_keys_copy.end(), d_keys.begin());
-    thrust::sort(d_keys.begin(), d_keys.end(), Comp());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-KeyTypes =  ['int']
-InputSizes = [2**24]
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
diff --git a/performance/stl_sort.test b/performance/stl_sort.test
deleted file mode 100644
index 20b3aa188..000000000
--- a/performance/stl_sort.test
+++ /dev/null
@@ -1,29 +0,0 @@
-PREAMBLE = \
-    """
-    #include <algorithm>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$KeyType> h_keys = unittest::random_integers<$KeyType>($InputSize);
-    thrust::host_vector<$KeyType> h_keys_copy = h_keys;
-    """
-
-TIME = \
-    """
-    std::copy(h_keys_copy.begin(), h_keys_copy.end(), h_keys.begin());
-    std::sort(h_keys.begin(), h_keys.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_SORTING_RATE(double($InputSize));
-    """
-
-
-KeyTypes = ['char', 'short', 'int', 'long', 'float', 'double']
-InputSizes = [2**N for N in range(10, 25)]
-
-TestVariables = [('KeyType', KeyTypes), ('InputSize', InputSizes)]
-
diff --git a/performance/unique.test b/performance/unique.test
deleted file mode 100644
index 99c3aac8a..000000000
--- a/performance/unique.test
+++ /dev/null
@@ -1,42 +0,0 @@
-PREAMBLE = \
-    """
-    #include <thrust/unique.h>
-    """
-
-INITIALIZE = \
-    """
-    thrust::host_vector<$InputType> h_input = unittest::random_integers<$InputType>($InputSize);
-   
-    // increase likelihood of equal consecutive elements
-    for(size_t i = 0; i < $InputSize; i++)
-        h_input[i] %= 4;   
-
-    thrust::device_vector<$InputType> d_input = h_input;
-    thrust::device_vector<$InputType> d_copy = d_input;
-    
-    thrust::host_vector<$InputType>::iterator   h_end = thrust::unique(h_input.begin(), h_input.end());
-    thrust::device_vector<$InputType>::iterator d_end = thrust::unique(d_input.begin(), d_input.end());
-    
-    thrust::host_vector<$InputType>   h_result(h_input.begin(), h_end);
-    thrust::device_vector<$InputType> d_result(d_input.begin(), d_end);
-
-    ASSERT_EQUAL(h_result, d_result);
-    """
-
-TIME = \
-    """
-    thrust::copy(d_copy.begin(), d_copy.end(), d_input.begin());
-    thrust::unique(d_input.begin(), d_input.end());
-    """
-
-FINALIZE = \
-    """
-    RECORD_TIME();
-    RECORD_THROUGHPUT(double($InputSize));
-    """
-
-InputTypes = SignedIntegerTypes
-InputSizes = StandardSizes
-
-TestVariables = [('InputType', InputTypes), ('InputSize', InputSizes)]
-
diff --git a/scripts/gdb-pretty-printers.py b/scripts/gdb-pretty-printers.py
new file mode 100644
index 000000000..15d790411
--- /dev/null
+++ b/scripts/gdb-pretty-printers.py
@@ -0,0 +1,153 @@
+import gdb
+import sys
+
+if sys.version_info[0] > 2:
+    Iterator = object
+else:
+    # "Polyfill" for Python2 Iterator interface
+    class Iterator:
+        def next(self):
+            return self.__next__()
+
+
+class ThrustVectorPrinter(gdb.printing.PrettyPrinter):
+    "Print a thrust::*_vector"
+
+    class _host_accessible_iterator(Iterator):
+        def __init__(self, start, size):
+            self.item = start
+            self.size = size
+            self.count = 0
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            if self.count >= self.size:
+                raise StopIteration
+            elt = self.item.dereference()
+            count = self.count
+            self.item = self.item + 1
+            self.count = self.count + 1
+            return ('[%d]' % count, elt)
+
+    class _device_iterator(Iterator):
+        def __init__(self, start, size):
+            self.exec = exec
+            self.item = start
+            self.size = size
+            self.count = 0
+            self.buffer = None
+            self.sizeof = self.item.dereference().type.sizeof
+            self.buffer_start = 0
+            # At most 1 MB or size, at least 1
+            self.buffer_size = min(size, max(1, 2 ** 20 // self.sizeof))
+            self.buffer = gdb.parse_and_eval(
+                '(void*)malloc(%s)' % (self.buffer_size * self.sizeof))
+            self.buffer.fetch_lazy()
+            self.buffer_count = self.buffer_size
+            self.update_buffer()
+
+        def update_buffer(self):
+            if self.buffer_count >= self.buffer_size:
+                self.buffer_item = gdb.parse_and_eval(
+                    hex(self.buffer)).cast(self.item.type)
+                self.buffer_count = 0
+                self.buffer_start = self.count
+                device_addr = hex(self.item.dereference().address)
+                buffer_addr = hex(self.buffer)
+                size = min(self.buffer_size, self.size -
+                           self.buffer_start) * self.sizeof
+                status = gdb.parse_and_eval(
+                    '(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % (buffer_addr, device_addr, size))
+                if status != 0:
+                    raise gdb.MemoryError(
+                        'memcpy from device failed: %s' % status)
+
+        def __del__(self):
+            gdb.parse_and_eval('(void)free(%s)' %
+                               hex(self.buffer)).fetch_lazy()
+
+        def __iter__(self):
+            return self
+
+        def __next__(self):
+            if self.count >= self.size:
+                raise StopIteration
+            self.update_buffer()
+            elt = self.buffer_item.dereference()
+            self.buffer_item = self.buffer_item + 1
+            self.buffer_count = self.buffer_count + 1
+            count = self.count
+            self.item = self.item + 1
+            self.count = self.count + 1
+            return ('[%d]' % count, elt)
+
+    def __init__(self, val):
+        self.val = val
+        self.pointer = val['m_storage']['m_begin']['m_iterator']
+        self.size = int(val['m_size'])
+        self.capacity = int(val['m_storage']['m_size'])
+        self.is_device = False
+        if str(self.pointer.type).startswith("thrust::device_ptr"):
+            self.pointer = self.pointer['m_iterator']
+            self.is_device = True
+
+    def children(self):
+        if self.is_device:
+            return self._device_iterator(self.pointer, self.size)
+        else:
+            return self._host_accessible_iterator(self.pointer, self.size)
+
+    def to_string(self):
+        typename = str(self.val.type)
+        return ('%s of length %d, capacity %d' % (typename, self.size, self.capacity))
+
+    def display_hint(self):
+        return 'array'
+
+
+class ThrustReferencePrinter(gdb.printing.PrettyPrinter):
+    "Print a thrust::device_reference"
+
+    def __init__(self, val):
+        self.val = val
+        self.pointer = val['ptr']['m_iterator']
+        self.type = self.pointer.dereference().type
+        sizeof = self.type.sizeof
+        self.buffer = gdb.parse_and_eval('(void*)malloc(%s)' % sizeof)
+        device_addr = hex(self.pointer)
+        buffer_addr = hex(self.buffer)
+        status = gdb.parse_and_eval('(cudaError)cudaMemcpy(%s, %s, %d, cudaMemcpyDeviceToHost)' % (
+            buffer_addr, device_addr, sizeof))
+        if status != 0:
+            raise gdb.MemoryError('memcpy from device failed: %s' % status)
+        self.buffer_val = gdb.parse_and_eval(
+            hex(self.buffer)).cast(self.pointer.type).dereference()
+
+    def __del__(self):
+        gdb.parse_and_eval('(void)free(%s)' % hex(self.buffer)).fetch_lazy()
+
+    def children(self):
+        return []
+
+    def to_string(self):
+        typename = str(self.val.type)
+        return ('(%s) @%s: %s' % (typename, self.pointer, self.buffer_val))
+
+    def display_hint(self):
+        return None
+
+
+def lookup_thrust_type(val):
+    if not str(val.type.unqualified()).startswith('thrust::'):
+        return None
+    suffix = str(val.type.unqualified())[8:]
+    if suffix.startswith('host_vector') or suffix.startswith('device_vector'):
+        return ThrustVectorPrinter(val)
+    elif int(gdb.VERSION.split(".")[0]) >= 10 and suffix.startswith('device_reference'):
+        return ThrustReferencePrinter(val)
+    return None
+
+
+gdb.pretty_printers.append(lookup_thrust_type)
diff --git a/site_scons/site_tools/nvcc.py b/site_scons/site_tools/nvcc.py
deleted file mode 100644
index be0b323e8..000000000
--- a/site_scons/site_tools/nvcc.py
+++ /dev/null
@@ -1,132 +0,0 @@
-"""SCons.Tool.nvcc
-
-Tool-specific initialization for NVIDIA CUDA Compiler.
-
-There normally shouldn't be any need to import this module directly.
-It will usually be imported through the generic SCons.Tool.Tool()
-selection method.
-
-"""
-
-import SCons.Tool
-import SCons.Scanner.C
-import SCons.Defaults
-import os
-import platform
-
-
-def get_cuda_paths():
-  """Determines CUDA {bin,lib,include} paths
-  
-  returns (bin_path,lib_path,inc_path)
-  """
-
-  # determine defaults
-  if os.name == 'nt':
-    bin_path = 'C:/CUDA/bin'
-    lib_path = 'C:/CUDA/lib'
-    inc_path = 'C:/CUDA/include'
-  elif os.name == 'posix':
-    bin_path = '/usr/local/cuda/bin'
-    lib_path = '/usr/local/cuda/lib'
-    inc_path = '/usr/local/cuda/include'
-  else:
-    raise ValueError, 'Error: unknown OS.  Where is nvcc installed?'
-   
-  if platform.machine()[-2:] == '64':
-    lib_path += '64'
-
-  # override with environement variables
-  if 'CUDA_BIN_PATH' in os.environ:
-    bin_path = os.path.abspath(os.environ['CUDA_BIN_PATH'])
-  if 'CUDA_LIB_PATH' in os.environ:
-    lib_path = os.path.abspath(os.environ['CUDA_LIB_PATH'])
-  if 'CUDA_INC_PATH' in os.environ:
-    inc_path = os.path.abspath(os.environ['CUDA_INC_PATH'])
-
-  return (bin_path,lib_path,inc_path)
-
-
-
-CUDASuffixes = ['.cu']
-
-# make a CUDAScanner for finding #includes
-# cuda uses the c preprocessor, so we can use the CScanner
-CUDAScanner = SCons.Scanner.C.CScanner()
-
-def add_common_nvcc_variables(env):
-  """
-  Add underlying common "NVIDIA CUDA compiler" variables that
-  are used by multiple builders.
-  """
-
-  # "NVCC common command line"
-  if not env.has_key('_NVCCCOMCOM'):
-    # nvcc needs '-I' prepended before each include path, regardless of platform
-    env['_NVCCWRAPCPPPATH'] = '${_concat("-I ", CPPPATH, "", __env__)}'
-    # prepend -Xcompiler before each flag
-    env['_NVCCWRAPCFLAGS'] =     '${_concat("-Xcompiler ", CFLAGS,     "", __env__)}'
-    env['_NVCCWRAPSHCFLAGS'] =   '${_concat("-Xcompiler ", SHCFLAGS,   "", __env__)}'
-    env['_NVCCWRAPCCFLAGS'] =   '${_concat("-Xcompiler ", CCFLAGS,   "", __env__)}'
-    env['_NVCCWRAPSHCCFLAGS'] = '${_concat("-Xcompiler ", SHCCFLAGS, "", __env__)}'
-    # assemble the common command line
-    env['_NVCCCOMCOM'] = '${_concat("-Xcompiler ", CPPFLAGS, "", __env__)} $_CPPDEFFLAGS $_NVCCWRAPCPPPATH'
-
-def generate(env):
-  """
-  Add Builders and construction variables for CUDA compilers to an Environment.
-  """
-
-  # create a builder that makes PTX files from .cu files
-  ptx_builder = SCons.Builder.Builder(action = '$NVCC -ptx $NVCCFLAGS $_NVCCWRAPCFLAGS $_NVCCWRAPCCFLAGS $_NVCCCOMCOM $SOURCES -o $TARGET',
-                                      emitter = {},
-                                      suffix = '.ptx',
-                                      src_suffix = CUDASuffixes)
-  env['BUILDERS']['PTXFile'] = ptx_builder
-
-  # create builders that make static & shared objects from .cu files
-  static_obj, shared_obj = SCons.Tool.createObjBuilders(env)
-
-  for suffix in CUDASuffixes:
-    # Add this suffix to the list of things buildable by Object
-    static_obj.add_action('$CUDAFILESUFFIX', '$NVCCCOM')
-    shared_obj.add_action('$CUDAFILESUFFIX', '$SHNVCCCOM')
-    static_obj.add_emitter(suffix, SCons.Defaults.StaticObjectEmitter)
-    shared_obj.add_emitter(suffix, SCons.Defaults.SharedObjectEmitter)
-
-    # Add this suffix to the list of things scannable
-    SCons.Tool.SourceFileScanner.add_scanner(suffix, CUDAScanner)
-
-  add_common_nvcc_variables(env)
-
-  # set the "CUDA Compiler Command" environment variable
-  # windows is picky about getting the full filename of the executable
-  if os.name == 'nt':
-    env['NVCC'] = 'nvcc.exe'
-    env['SHNVCC'] = 'nvcc.exe'
-  else:
-    env['NVCC'] = 'nvcc'
-    env['SHNVCC'] = 'nvcc'
-  
-  # set the include path, and pass both c compiler flags and c++ compiler flags
-  env['NVCCFLAGS'] = SCons.Util.CLVar('')
-  env['SHNVCCFLAGS'] = SCons.Util.CLVar('') + ' -shared'
-  
-  # 'NVCC Command'
-  env['NVCCCOM']   = '$NVCC -o $TARGET -c $NVCCFLAGS $_NVCCWRAPCFLAGS $_NVCCWRAPCCFLAGS $_NVCCCOMCOM $SOURCES'
-  env['SHNVCCCOM'] = '$SHNVCC -o $TARGET -c $SHNVCCFLAGS $_NVCCWRAPSHCFLAGS $_NVCCWRAPSHCCFLAGS $_NVCCCOMCOM $SOURCES'
-  
-  # the suffix of CUDA source files is '.cu'
-  env['CUDAFILESUFFIX'] = '.cu'
-
-  # XXX add code to generate builders for other miscellaneous
-  # CUDA files here, such as .gpu, etc.
-
-  # XXX intelligently detect location of nvcc and cuda libraries here
-  (bin_path,lib_path,inc_path) = get_cuda_paths()
-    
-  env.PrependENVPath('PATH', bin_path)
-
-def exists(env):
-  return env.Detect('nvcc')
-
diff --git a/site_scons/site_tools/zip.py b/site_scons/site_tools/zip.py
deleted file mode 100644
index 1c84eb6c3..000000000
--- a/site_scons/site_tools/zip.py
+++ /dev/null
@@ -1,101 +0,0 @@
-"""SCons.Tool.zip
-
-Tool-specific initialization for zip.
-
-There normally shouldn't be any need to import this module directly.
-It will usually be imported through the generic SCons.Tool.Tool()
-selection method.
-
-This version applies the patch from scons.tigris.org/issues/show_bug.cgi?id=2575
-
-"""
-
-#
-# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 The SCons Foundation
-#
-# Permission is hereby granted, free of charge, to any person obtaining
-# a copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish,
-# distribute, sublicense, and/or sell copies of the Software, and to
-# permit persons to whom the Software is furnished to do so, subject to
-# the following conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
-# KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
-# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
-# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-#
-
-__revision__ = "src/engine/SCons/Tool/zip.py 5134 2010/08/16 23:02:40 bdeegan"
-
-import os.path
-
-import SCons.Builder
-import SCons.Defaults
-import SCons.Node.FS
-import SCons.Util
-
-try:
-    import zipfile
-    internal_zip = 1
-except ImportError:
-    internal_zip = 0
-
-if internal_zip:
-    zipcompression = zipfile.ZIP_DEFLATED
-    def zip(target, source, env):
-        compression = env.get('ZIPCOMPRESSION', 0)
-        zf = zipfile.ZipFile(target[0].abspath, 'w', compression)
-        for s in source:
-            if s.isdir():
-                for dirpath, dirnames, filenames in os.walk(os.path.relpath(s.abspath)):
-                    for fname in filenames:
-                        path = os.path.join(dirpath, fname)
-                        if os.path.isfile(path):
-                            zf.write(path)
-            else:
-                zf.write(os.path.relpath(s.abspath))
-        zf.close()
-else:
-    zipcompression = 0
-    zip = "$ZIP $ZIPFLAGS ${TARGET.abspath} $SOURCES"
-
-
-zipAction = SCons.Action.Action(zip, varlist=['ZIPCOMPRESSION'])
-
-ZipBuilder = SCons.Builder.Builder(action = SCons.Action.Action('$ZIPCOM', '$ZIPCOMSTR'),
-                                   source_factory = SCons.Node.FS.Entry,
-                                   source_scanner = SCons.Defaults.DirScanner,
-                                   suffix = '$ZIPSUFFIX',
-                                   multi = 1)
-
-
-def generate(env):
-    """Add Builders and construction variables for zip to an Environment."""
-    try:
-        bld = env['BUILDERS']['Zip']
-    except KeyError:
-        bld = ZipBuilder
-        env['BUILDERS']['Zip'] = bld
-
-    env['ZIP']        = 'zip'
-    env['ZIPFLAGS']   = SCons.Util.CLVar('')
-    env['ZIPCOM']     = zipAction
-    env['ZIPCOMPRESSION'] =  zipcompression
-    env['ZIPSUFFIX']  = '.zip'
-
-def exists(env):
-    return internal_zip or env.Detect('zip')
-
-# Local Variables:
-# tab-width:4
-# indent-tabs-mode:nil
-# End:
-# vim: set expandtab tabstop=4 shiftwidth=4:
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
new file mode 100644
index 000000000..0f0749c4e
--- /dev/null
+++ b/testing/CMakeLists.txt
@@ -0,0 +1,169 @@
+# Create meta targets that build all tests for a single configuration:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+  set(config_meta_target ${config_prefix}.tests)
+  add_custom_target(${config_meta_target})
+  add_dependencies(${config_prefix}.all ${config_meta_target})
+endforeach()
+
+# Update flags to reflect RDC options. See note in ThrustCudaConfig.cmake --
+# these flag variables behave unintuitively:
+if (THRUST_ENABLE_TESTS_WITH_RDC)
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+else()
+  set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_NO_RDC}")
+endif()
+
+# Generate testing framework libraries:
+add_subdirectory(unittest)
+
+# Some tests only support certain host.device configurations. Use this macro to
+# declare allowed configurations. If not specified, all host.device config are
+# used.
+set(restricted_tests)
+macro(thrust_declare_test_restrictions test_name)
+  list(APPEND restricted_tests ${test_name})
+  list(APPEND ${test_name}_host.device_allowed ${ARGN})
+endmacro()
+
+# Async/future/event tests only support the CUDA backend:
+thrust_declare_test_restrictions(async_copy        CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_for_each    CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_reduce      CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_reduce_into CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_sort        CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(async_transform   CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(event             CPP.CUDA OMP.CUDA TBB.CUDA)
+thrust_declare_test_restrictions(future            CPP.CUDA OMP.CUDA TBB.CUDA)
+
+# This test is incompatible with TBB and OMP, since it requires special per-device
+# handling to process exceptions in a device function, which is only implemented
+# for CUDA.
+thrust_declare_test_restrictions(unittest_static_assert CPP.CPP CPP.CUDA)
+
+# In the TBB backend, reduce_by_key does not currently work with transform_output_iterator
+# https://github.com/NVIDIA/thrust/issues/1811
+thrust_declare_test_restrictions(transform_output_iterator_reduce_by_key CPP.CPP CPP.OMP CPP.CUDA)
+
+## thrust_add_test
+#
+# Add a test executable and register it with ctest.
+#
+# target_name_var: Variable name to overwrite with the name of the test
+#   target. Useful for post-processing target information per-backend.
+# test_name: The name of the test minus "<config_prefix>.test." For example,
+#   testing/vector.cu will be "vector", and testing/cuda/copy.cu will be
+#   "cuda.copy".
+# test_src: The source file that implements the test.
+# thrust_target: The reference thrust target with configuration information.
+#
+function(thrust_add_test target_name_var test_name test_src thrust_target)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  # Wrap the .cu file in .cpp for non-CUDA backends
+  if ("CUDA" STREQUAL "${config_device}")
+    set(real_test_src "${test_src}")
+  else()
+    thrust_wrap_cu_in_cpp(real_test_src "${test_src}" ${thrust_target})
+  endif()
+
+  # The actual name of the test's target:
+  set(test_target ${config_prefix}.test.${test_name})
+  set(${target_name_var} ${test_target} PARENT_SCOPE)
+
+  # Related target names:
+  set(config_framework_target ${config_prefix}.test.framework)
+  set(config_meta_target ${config_prefix}.tests)
+  set(test_meta_target thrust.all.test.${test_name})
+
+  add_executable(${test_target} "${real_test_src}")
+  target_link_libraries(${test_target} PRIVATE ${config_framework_target})
+  target_include_directories(${test_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${test_target} ${thrust_target})
+
+  if (NOT "Clang" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+    target_compile_definitions(${test_target} PRIVATE THRUST_TEST_DEVICE_SIDE)
+  endif()
+
+  thrust_fix_clang_nvcc_build_for(${test_target})
+
+  # Add to the active configuration's meta target
+  add_dependencies(${config_meta_target} ${test_target})
+
+  # Meta target that builds tests with this name for all configurations:
+  if (NOT TARGET ${test_meta_target})
+    add_custom_target(${test_meta_target})
+  endif()
+  add_dependencies(${test_meta_target} ${test_target})
+
+  add_test(NAME ${test_target}
+    COMMAND "${CMAKE_COMMAND}"
+    "-DTHRUST_BINARY=$<TARGET_FILE:${test_target}>"
+    "-DTHRUST_SOURCE=${Thrust_SOURCE_DIR}"
+    -P "${Thrust_SOURCE_DIR}/cmake/ThrustRunTest.cmake"
+  )
+
+  # Run OMP/TBB tests in serial. Multiple OMP processes will massively
+  # oversubscribe the machine with GCC's OMP, and we want to test these with
+  # the full CPU available to each unit test.
+  set(config_systems ${config_host} ${config_device})
+  if (("OMP" IN_LIST config_systems) OR ("TBB" IN_LIST config_systems))
+    set_tests_properties(${test_target} PROPERTIES RUN_SERIAL ON)
+  endif()
+
+  # Check for per-test script. Script will be included in the current scope
+  # to allow custom property modifications.
+  get_filename_component(test_cmake_script "${test_src}" NAME_WLE)
+  set(test_cmake_script "${CMAKE_CURRENT_LIST_DIR}/${test_cmake_script}.cmake")
+  # Use a glob so we can detect if this changes:
+  file(GLOB test_cmake_script
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    CONFIGURE_DEPENDS
+    "${test_cmake_script}"
+  )
+  if (test_cmake_script) # Will be non-empty only if the script exists
+    include("${test_cmake_script}")
+  endif()
+endfunction()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# Add common tests to all configs:
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_host ${thrust_target} HOST)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+
+    # Is this test restricted to only certain host/device combinations?
+    if(${test_name} IN_LIST restricted_tests)
+      # Is the current host/device combination supported?
+      if (NOT "${config_host}.${config_device}" IN_LIST
+            ${test_name}_host.device_allowed)
+        continue()
+      endif()
+    endif()
+
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+
+    if (THRUST_ENABLE_TESTS_WITH_RDC AND ("CUDA" STREQUAL "${config_device}"))
+      thrust_enable_rdc_for_cuda_target(${test_target})
+    endif()
+  endforeach()
+endforeach()
+
+# Add specialized tests:
+add_subdirectory(async)
+add_subdirectory(cmake)
+add_subdirectory(cpp)
+add_subdirectory(cuda)
+add_subdirectory(omp)
+add_subdirectory(regression)
diff --git a/testing/SConscript b/testing/SConscript
deleted file mode 100644
index 4ed12a9cd..000000000
--- a/testing/SConscript
+++ /dev/null
@@ -1,60 +0,0 @@
-Import('env')
-
-# clone the parent's env so that we do not modify it
-my_env = env.Clone()
-
-vars = Variables()
-
-# add a variable to filter source files by a regex
-vars.Add('tests', 'Filter test files using a regex', '.')
-
-# update variables
-my_env.Help(vars.GenerateHelpText(env))
-vars.Update(my_env)
-
-# populate the environment
-
-# with cl we have to do /bigobj
-if my_env.subst('$CXX') == 'cl':
-  my_env.Append(CPPFLAGS = '/bigobj')
-
-# #include the current directory
-my_env.Append(CPPPATH = Dir('.').srcnode())
-
-# find all .cus & .cpps
-sources = []
-extensions  = ['*.cu', '*.cpp']
-
-# gather sources in the current directorie
-for ext in extensions:
-  sources.extend(my_env.Glob(ext))
-
-# gather sources from directories
-sources.extend(SConscript('backend/SConscript', exports='env'))
-
-# filter sources
-import re
-filter_exp = 'int main|driver_instance|{0}'.format(my_env['tests'])
-pattern = re.compile(filter_exp)
-def test_filter(src):
-  return pattern.search(src.get_contents())
-
-sources = filter(test_filter, sources)
-
-tester = my_env.Program('tester', sources)
-
-# create a 'unit_tests' alias
-unit_tests_alias = my_env.Alias('unit_tests', [tester])
-
-# add the verbose tester to the 'run_unit_tests' alias
-run_unit_tests_alias = my_env.Alias('run_unit_tests', [tester], tester[0].abspath + ' --verbose')
-
-# always build the 'run_unit_tests' target whether or not it needs it
-my_env.AlwaysBuild(run_unit_tests_alias)
-
-# add the unit tests alias to the 'run_tests' alias
-my_env.Alias('run_tests', [tester], tester[0].abspath)
-
-# build children
-SConscript('trivial_tests/SConscript', exports='env')
-
diff --git a/testing/adjacent_difference.cu b/testing/adjacent_difference.cu
index 7a8b000f5..5f97ea350 100644
--- a/testing/adjacent_difference.cu
+++ b/testing/adjacent_difference.cu
@@ -2,6 +2,8 @@
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 template <class Vector>
 void TestAdjacentDifferenceSimple(void)
@@ -13,21 +15,21 @@ void TestAdjacentDifferenceSimple(void)
     input[0] = 1; input[1] = 4; input[2] = 6;
 
     typename Vector::iterator result;
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin());
 
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T(1));
     ASSERT_EQUAL(output[1], T(3));
     ASSERT_EQUAL(output[2], T(2));
-    
+
     result = thrust::adjacent_difference(input.begin(), input.end(), output.begin(), thrust::plus<T>());
-    
+
     ASSERT_EQUAL(result - output.begin(), 3);
     ASSERT_EQUAL(output[0], T( 1));
     ASSERT_EQUAL(output[1], T( 5));
     ASSERT_EQUAL(output[2], T(10));
-    
+
     // test in-place operation, result and first are permitted to be the same
     result = thrust::adjacent_difference(input.begin(), input.end(), input.begin());
 
@@ -54,23 +56,23 @@ void TestAdjacentDifference(const size_t n)
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin());
 
-    ASSERT_EQUAL(h_result - h_output.begin(), n);
-    ASSERT_EQUAL(d_result - d_output.begin(), n);
+    ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
 
-    ASSERT_EQUAL(h_result - h_output.begin(), n);
-    ASSERT_EQUAL(d_result - d_output.begin(), n);
+    ASSERT_EQUAL(std::size_t(h_result - h_output.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_output.begin()), n);
     ASSERT_EQUAL(h_output, d_output);
-    
+
     // in-place operation
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
 
-    ASSERT_EQUAL(h_result - h_input.begin(), n);
-    ASSERT_EQUAL(d_result - d_input.begin(), n);
+    ASSERT_EQUAL(std::size_t(h_result - h_input.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_input.begin()), n);
     ASSERT_EQUAL(h_input, h_output); //computed previously
     ASSERT_EQUAL(d_input, d_output); //computed previously
 }
@@ -90,13 +92,13 @@ void TestAdjacentDifferenceInPlaceWithRelatedIteratorTypes(const size_t n)
 
     h_result = thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
-    
+
     // in-place operation with different iterator types
     h_result = thrust::adjacent_difference(h_input.cbegin(), h_input.cend(), h_input.begin(), thrust::plus<T>());
     d_result = thrust::adjacent_difference(d_input.cbegin(), d_input.cend(), d_input.begin(), thrust::plus<T>());
 
-    ASSERT_EQUAL(h_result - h_input.begin(), n);
-    ASSERT_EQUAL(d_result - d_input.begin(), n);
+    ASSERT_EQUAL(std::size_t(h_result - h_input.begin()), n);
+    ASSERT_EQUAL(std::size_t(d_result - d_input.begin()), n);
     ASSERT_EQUAL(h_output, h_input); // reference computed previously
     ASSERT_EQUAL(d_output, d_input); // reference computed previously
 }
@@ -159,4 +161,3 @@ void TestAdjacentDifferenceDispatchImplicit()
     ASSERT_EQUAL(13, d_input.front());
 }
 DECLARE_UNITTEST(TestAdjacentDifferenceDispatchImplicit);
-
diff --git a/testing/advance.cu b/testing/advance.cu
index 99900b6a9..0860ef598 100644
--- a/testing/advance.cu
+++ b/testing/advance.cu
@@ -5,27 +5,89 @@
 // TODO expand this with other iterator types (forward, bidirectional, etc.)
 
 template <typename Vector>
-void TestAdvance(void)
+void TestAdvance()
 {
     typedef typename Vector::value_type T;
     typedef typename Vector::iterator Iterator;
 
-    Vector v(100);
+    Vector v(10);
     thrust::sequence(v.begin(), v.end());
 
     Iterator i = v.begin();
 
-    thrust::advance(i, 7);
+    thrust::advance(i, 1);
 
-    ASSERT_EQUAL(*i, T(7));
+    ASSERT_EQUAL(*i, T(1));
     
-    thrust::advance(i, 13);
+    thrust::advance(i, 8);
 
-    ASSERT_EQUAL(*i, T(20));
+    ASSERT_EQUAL(*i, T(9));
     
-    thrust::advance(i, -10);
+    thrust::advance(i, -4);
 
-    ASSERT_EQUAL(*i, T(10));
+    ASSERT_EQUAL(*i, T(5));
 }
 DECLARE_VECTOR_UNITTEST(TestAdvance);
 
+template <typename Vector>
+void TestNext()
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(10);
+    thrust::sequence(v.begin(), v.end());
+
+    Iterator const i0 = v.begin();
+
+    Iterator const i1 = thrust::next(i0);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    
+    Iterator const i2 = thrust::next(i1, 8);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    ASSERT_EQUAL(*i2, T(9));
+    
+    Iterator const i3 = thrust::next(i2, -4);
+
+    ASSERT_EQUAL(*i0, T(0));
+    ASSERT_EQUAL(*i1, T(1));
+    ASSERT_EQUAL(*i2, T(9));
+    ASSERT_EQUAL(*i3, T(5));
+}
+DECLARE_VECTOR_UNITTEST(TestNext);
+
+template <typename Vector>
+void TestPrev()
+{
+    typedef typename Vector::value_type T;
+    typedef typename Vector::iterator Iterator;
+
+    Vector v(10);
+    thrust::sequence(v.begin(), v.end());
+
+    Iterator const i0 = v.end();
+
+    Iterator const i1 = thrust::prev(i0);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    
+    Iterator const i2 = thrust::prev(i1, 8);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    ASSERT_EQUAL(*i2, T(1));
+    
+    Iterator const i3 = thrust::prev(i2, -4);
+
+    ASSERT_EQUAL_QUIET(i0, v.end());
+    ASSERT_EQUAL(*i1, T(9));
+    ASSERT_EQUAL(*i2, T(1));
+    ASSERT_EQUAL(*i3, T(5));
+}
+DECLARE_VECTOR_UNITTEST(TestPrev);
+
diff --git a/testing/alignment.cu b/testing/alignment.cu
new file mode 100644
index 000000000..e55df2e96
--- /dev/null
+++ b/testing/alignment.cu
@@ -0,0 +1,360 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/alignment.h>
+
+struct alignof_mock_0
+{
+    char a;
+    char b;
+}; // size: 2 * sizeof(char), alignment: sizeof(char)
+
+struct alignof_mock_1
+{
+    int n;
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_2
+{
+    int n;
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_3
+{
+    char c;
+    // sizeof(int) - sizeof(char) bytes of padding
+    int n;
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_4
+{
+    char c0;
+    // sizeof(int) - sizeof(char) bytes of padding
+    int n;
+    char c1;
+    // sizeof(int) - sizeof(char) bytes of padding
+}; // size: 3 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_5
+{
+    char c0;
+    char c1;
+    // sizeof(int) - 2 * sizeof(char) bytes of padding
+    int n;
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+struct alignof_mock_6
+{
+    int n;
+    char c0;
+    char c1;
+    // sizeof(int) - 2 * sizeof(char) bytes of padding
+}; // size: 2 * sizeof(int), alignment: sizeof(int)
+
+void test_alignof_mocks_sizes()
+{
+    ASSERT_EQUAL(sizeof(alignof_mock_0), 2 * sizeof(char));
+    ASSERT_EQUAL(sizeof(alignof_mock_1), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_2), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_3), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_4), 3 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_5), 2 * sizeof(int));
+    ASSERT_EQUAL(sizeof(alignof_mock_6), 2 * sizeof(int));
+}
+DECLARE_UNITTEST(test_alignof_mocks_sizes);
+
+void test_alignof()
+{
+    ASSERT_EQUAL(THRUST_ALIGNOF(bool)                  , sizeof(bool));
+    ASSERT_EQUAL(THRUST_ALIGNOF(signed char)           , sizeof(signed char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned char)         , sizeof(unsigned char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(char)                  , sizeof(char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(short int)             , sizeof(short int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned short int)    , sizeof(unsigned short int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(int)                   , sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned int)          , sizeof(unsigned int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long int)              , sizeof(long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned long int)     , sizeof(unsigned long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long long int)         , sizeof(long long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(unsigned long long int), sizeof(unsigned long long int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(float)                 , sizeof(float));
+    ASSERT_EQUAL(THRUST_ALIGNOF(double)                , sizeof(double));
+    ASSERT_EQUAL(THRUST_ALIGNOF(long double)           , sizeof(long double));
+
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_0), sizeof(char));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_1), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_2), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_3), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_4), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_5), sizeof(int));
+    ASSERT_EQUAL(THRUST_ALIGNOF(alignof_mock_6), sizeof(int));
+}
+DECLARE_UNITTEST(test_alignof);
+
+void test_alignment_of()
+{
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<bool>::value
+      , sizeof(bool)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<signed char>::value
+      , sizeof(signed char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned char>::value
+      , sizeof(unsigned char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<char>::value
+      , sizeof(char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<short int>::value
+      , sizeof(short int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned short int>::value
+      , sizeof(unsigned short int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<int>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned int>::value
+      , sizeof(unsigned int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long int>::value
+      , sizeof(long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned long int>::value
+      , sizeof(unsigned long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long long int>::value
+      , sizeof(long long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<unsigned long long int>::value
+      , sizeof(unsigned long long int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<float>::value
+      , sizeof(float)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<double>::value
+      , sizeof(double)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<long double>::value
+      , sizeof(long double)
+    );
+
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_0>::value
+      , sizeof(char)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_1>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_2>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_3>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_4>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_5>::value
+      , sizeof(int)
+    );
+    ASSERT_EQUAL(
+        thrust::detail::alignment_of<alignof_mock_6>::value
+      , sizeof(int)
+    );
+}
+DECLARE_UNITTEST(test_alignment_of);
+
+template <std::size_t Align>
+void test_aligned_type_instantiation()
+{
+    typedef typename thrust::detail::aligned_type<Align>::type type;
+    ASSERT_GEQUAL(sizeof(type), 1lu);
+    ASSERT_EQUAL(THRUST_ALIGNOF(type), Align);
+    ASSERT_EQUAL(thrust::detail::alignment_of<type>::value, Align);
+}
+
+void test_aligned_type()
+{
+    test_aligned_type_instantiation<1>();
+    test_aligned_type_instantiation<2>();
+    test_aligned_type_instantiation<4>();
+    test_aligned_type_instantiation<8>();
+    test_aligned_type_instantiation<16>();
+    test_aligned_type_instantiation<32>();
+    test_aligned_type_instantiation<64>();
+    test_aligned_type_instantiation<128>();
+}
+DECLARE_UNITTEST(test_aligned_type);
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation(thrust::detail::true_type /* Align is valid */)
+{
+    typedef typename thrust::detail::aligned_storage<Len, Align>::type type;
+    ASSERT_GEQUAL(sizeof(type), Len);
+    ASSERT_EQUAL(THRUST_ALIGNOF(type), Align);
+    ASSERT_EQUAL(thrust::detail::alignment_of<type>::value, Align);
+}
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation(thrust::detail::false_type /* Align is invalid */)
+{
+  // no-op -- alignment is > max_align_t and MSVC complains loudly.
+}
+
+template <std::size_t Len, std::size_t Align>
+void test_aligned_storage_instantiation()
+{
+  typedef thrust::detail::integral_constant<
+      bool, Align <= THRUST_ALIGNOF(thrust::detail::max_align_t)>
+      ValidAlign;
+  test_aligned_storage_instantiation<Len, Align>(ValidAlign());
+}
+
+template <std::size_t Len>
+void test_aligned_storage_size()
+{
+    test_aligned_storage_instantiation<Len, 1>();
+    test_aligned_storage_instantiation<Len, 2>();
+    test_aligned_storage_instantiation<Len, 4>();
+    test_aligned_storage_instantiation<Len, 8>();
+    test_aligned_storage_instantiation<Len, 16>();
+    test_aligned_storage_instantiation<Len, 32>();
+    test_aligned_storage_instantiation<Len, 64>();
+    test_aligned_storage_instantiation<Len, 128>();
+}
+
+void test_aligned_storage()
+{
+    test_aligned_storage_size<1>();
+    test_aligned_storage_size<2>();
+    test_aligned_storage_size<4>();
+    test_aligned_storage_size<8>();
+    test_aligned_storage_size<16>();
+    test_aligned_storage_size<32>();
+    test_aligned_storage_size<64>();
+    test_aligned_storage_size<128>();
+    test_aligned_storage_size<256>();
+    test_aligned_storage_size<512>();
+    test_aligned_storage_size<1024>();
+    test_aligned_storage_size<2048>();
+    test_aligned_storage_size<4096>();
+    test_aligned_storage_size<8192>();
+    test_aligned_storage_size<16384>();
+
+    test_aligned_storage_size<3>();
+    test_aligned_storage_size<5>();
+    test_aligned_storage_size<7>();
+
+    test_aligned_storage_size<17>();
+    test_aligned_storage_size<42>();
+
+    test_aligned_storage_size<10000>();
+}
+DECLARE_UNITTEST(test_aligned_storage);
+
+void test_max_align_t()
+{
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(bool)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(signed char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(char)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(short int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned short int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(unsigned long long int)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(float)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(double)
+    );
+    ASSERT_GEQUAL(
+        THRUST_ALIGNOF(thrust::detail::max_align_t)
+      , THRUST_ALIGNOF(long double)
+    );
+}
+DECLARE_UNITTEST(test_max_align_t);
+
+void test_aligned_reinterpret_cast()
+{
+    thrust::detail::aligned_type<1>* a1 = 0;
+
+    thrust::detail::aligned_type<2>* a2 = 0;
+
+    // Cast to type with stricter (larger) alignment requirement.
+    a2 = thrust::detail::aligned_reinterpret_cast<
+        thrust::detail::aligned_type<2>*
+    >(a1);
+
+    // Cast to type with less strict (smaller) alignment requirement.
+    a1 = thrust::detail::aligned_reinterpret_cast<
+        thrust::detail::aligned_type<1>*
+    >(a2);
+}
+DECLARE_UNITTEST(test_aligned_reinterpret_cast);
+
diff --git a/testing/allocator.cu b/testing/allocator.cu
index 0026f9acb..175685ed0 100644
--- a/testing/allocator.cu
+++ b/testing/allocator.cu
@@ -1,16 +1,20 @@
 #include <unittest/unittest.h>
+#include <thrust/detail/config.h>
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/system/cpp/vector.h>
+
+#include <nv/target>
+
 #include <memory>
 
+template <typename T>
 struct my_allocator_with_custom_construct1
-  : thrust::device_malloc_allocator<int>
+  : thrust::device_malloc_allocator<T>
 {
   __host__ __device__
   my_allocator_with_custom_construct1()
   {}
 
-  template<typename T>
   __host__ __device__
   void construct(T *p)
   {
@@ -18,24 +22,25 @@ struct my_allocator_with_custom_construct1
   }
 };
 
-void TestAllocatorCustomDefaultConstruct()
+template <typename T>
+void TestAllocatorCustomDefaultConstruct(size_t n)
 {
-  thrust::device_vector<int> ref(10,13);
-  thrust::device_vector<int, my_allocator_with_custom_construct1> vec(10);
+  thrust::device_vector<T> ref(n, 13);
+  thrust::device_vector<T, my_allocator_with_custom_construct1<T> > vec(n);
 
   ASSERT_EQUAL_QUIET(ref, vec);
 }
-DECLARE_UNITTEST(TestAllocatorCustomDefaultConstruct);
-
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDefaultConstruct);
 
+template <typename T>
 struct my_allocator_with_custom_construct2
-  : thrust::device_malloc_allocator<int>
+  : thrust::device_malloc_allocator<T>
 {
   __host__ __device__
   my_allocator_with_custom_construct2()
   {}
 
-  template<typename T, typename Arg>
+  template <typename Arg>
   __host__ __device__
   void construct(T *p, const Arg &)
   {
@@ -43,23 +48,29 @@ struct my_allocator_with_custom_construct2
   }
 };
 
-void TestAllocatorCustomCopyConstruct()
+template <typename T>
+void TestAllocatorCustomCopyConstruct(size_t n)
 {
-  thrust::device_vector<int> ref(10,13);
-  thrust::device_vector<int> copy_from(10,7);
-  thrust::device_vector<int, my_allocator_with_custom_construct2> vec(copy_from.begin(), copy_from.end());
+  thrust::device_vector<T> ref(n, 13);
+  thrust::device_vector<T> copy_from(n, 7);
+  thrust::device_vector<T, my_allocator_with_custom_construct2<T> >
+    vec(copy_from.begin(), copy_from.end());
 
   ASSERT_EQUAL_QUIET(ref, vec);
 }
-DECLARE_UNITTEST(TestAllocatorCustomCopyConstruct);
-
-static int g_state;
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomCopyConstruct);
 
+template <typename T>
 struct my_allocator_with_custom_destroy
 {
-  typedef int         value_type;
-  typedef int &       reference;
-  typedef const int & const_reference;
+  // This is only used with thrust::cpp::vector:
+  using system_type = thrust::cpp::tag;
+
+  using value_type = T;
+  using reference = T &;
+  using const_reference = const T &;
+
+  static bool g_state;
 
   __host__
   my_allocator_with_custom_destroy(){}
@@ -72,13 +83,10 @@ struct my_allocator_with_custom_destroy
   __host__
   ~my_allocator_with_custom_destroy(){}
 
-  template<typename T>
   __host__ __device__
-  void destroy(T *p)
+  void destroy(T *)
   {
-#if !__CUDA_ARCH__
-    g_state = 13;
-#endif
+    NV_IF_TARGET(NV_IS_HOST, (g_state = true;));
   }
 
   value_type *allocate(std::ptrdiff_t n)
@@ -90,32 +98,51 @@ struct my_allocator_with_custom_destroy
   {
     use_me_to_alloc.deallocate(ptr,n);
   }
-  
+
+  bool operator==(const my_allocator_with_custom_destroy &) const
+  {
+    return true;
+  }
+
+  bool operator!=(const my_allocator_with_custom_destroy &other) const
+  {
+    return !(*this == other);
+  }
+
+  typedef thrust::detail::true_type is_always_equal;
+
   // use composition rather than inheritance
   // to avoid inheriting std::allocator's member
-  // function construct
-  std::allocator<int> use_me_to_alloc;
+  // function destroy
+  std::allocator<T> use_me_to_alloc;
 };
 
-void TestAllocatorCustomDestroy()
+template <typename T>
+bool my_allocator_with_custom_destroy<T>::g_state = false;
+
+template <typename T>
+void TestAllocatorCustomDestroy(size_t n)
 {
-  thrust::cpp::vector<int, my_allocator_with_custom_destroy> vec(10);
+  my_allocator_with_custom_destroy<T>::g_state = false;
 
-  // destroy everything
-  vec.shrink_to_fit();
+  {
+    thrust::cpp::vector<T, my_allocator_with_custom_destroy<T> > vec(n);
+  } // destroy everything
 
-  ASSERT_EQUAL(13, g_state);
+  // state should only be true when there are values to destroy:
+  ASSERT_EQUAL(n > 0, my_allocator_with_custom_destroy<T>::g_state);
 }
-DECLARE_UNITTEST(TestAllocatorCustomDestroy);
+DECLARE_VARIABLE_UNITTEST(TestAllocatorCustomDestroy);
 
+template <typename T>
 struct my_minimal_allocator
 {
-  typedef int         value_type;
+  typedef T         value_type;
 
   // XXX ideally, we shouldn't require
   //     these two typedefs
-  typedef int &       reference;
-  typedef const int & const_reference;
+  typedef T &       reference;
+  typedef const T & const_reference;
 
   __host__
   my_minimal_allocator(){}
@@ -138,18 +165,94 @@ struct my_minimal_allocator
     use_me_to_alloc.deallocate(ptr,n);
   }
 
-  std::allocator<int> use_me_to_alloc;
+  std::allocator<T> use_me_to_alloc;
 };
 
-void TestAllocatorMinimal()
+template <typename T>
+void TestAllocatorMinimal(size_t n)
 {
-  thrust::cpp::vector<int, my_minimal_allocator> vec(10, 13);
+  thrust::cpp::vector<int, my_minimal_allocator<int> > vec(n, 13);
 
   // XXX copy to h_vec because ASSERT_EQUAL doesn't know about cpp::vector
   thrust::host_vector<int> h_vec(vec.begin(), vec.end());
-  thrust::host_vector<int> ref(10, 13);
+  thrust::host_vector<int> ref(n, 13);
 
   ASSERT_EQUAL(ref, h_vec);
 }
-DECLARE_UNITTEST(TestAllocatorMinimal);
+DECLARE_VARIABLE_UNITTEST(TestAllocatorMinimal);
 
+void TestAllocatorTraitsRebind()
+{
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_traits<float>::other,
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<float>
+      >
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_traits<float>::other,
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<float>
+      >
+    >::value),
+    true
+  );
+}
+DECLARE_UNITTEST(TestAllocatorTraitsRebind);
+
+void TestAllocatorTraitsRebindCpp11()
+{
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_alloc<float>,
+      thrust::device_malloc_allocator<float>
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_alloc<float>,
+      my_minimal_allocator<float>
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<int>
+      >::template rebind_traits<float>,
+      typename thrust::detail::allocator_traits<
+        thrust::device_malloc_allocator<float>
+      >
+    >::value),
+    true
+  );
+
+  ASSERT_EQUAL(
+    (thrust::detail::is_same<
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<int>
+      >::template rebind_traits<float>,
+      typename thrust::detail::allocator_traits<
+        my_minimal_allocator<float>
+      >
+    >::value),
+    true
+  );
+}
+DECLARE_UNITTEST(TestAllocatorTraitsRebindCpp11);
diff --git a/testing/allocator_aware_policies.cu b/testing/allocator_aware_policies.cu
new file mode 100644
index 000000000..0a737c3ce
--- /dev/null
+++ b/testing/allocator_aware_policies.cu
@@ -0,0 +1,156 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/seq.h>
+#include <thrust/system/cpp/detail/par.h>
+#include <thrust/system/omp/detail/par.h>
+#include <thrust/system/tbb/detail/par.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#include <thrust/system/cuda/detail/par.h>
+#endif
+
+template<typename T>
+struct test_allocator_t
+{
+};
+
+test_allocator_t<int> test_allocator = test_allocator_t<int>();
+const test_allocator_t<int> const_test_allocator = test_allocator_t<int>();
+
+struct test_memory_resource_t final : thrust::mr::memory_resource<>
+{
+    void * do_allocate(std::size_t size, std::size_t) override
+    {
+        return reinterpret_cast<void *>(size);
+    }
+
+    void do_deallocate(void * ptr, std::size_t size, std::size_t) override
+    {
+        ASSERT_EQUAL(ptr, reinterpret_cast<void *>(size));
+    }
+} test_memory_resource;
+
+template<typename Policy, template <typename> class CRTPBase>
+struct policy_info
+{
+    typedef Policy policy;
+
+    template<template <typename, template <typename> class> class Template, typename Argument>
+    struct apply_base_second
+    {
+        typedef Template<Argument, CRTPBase> type;
+    };
+};
+
+template<typename PolicyInfo>
+struct TestAllocatorAttachment
+{
+    template<typename Expected, typename T>
+    static void assert_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator,
+                    Expected
+                >::type
+            >::value), true);
+    }
+
+    template<typename ExpectedResource, typename T>
+    static void assert_npa_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator,
+                    thrust::mr::allocator<
+                        thrust::detail::max_align_t,
+                        ExpectedResource
+                    >
+                >::type
+            >::value), true);
+    }
+
+    template<typename Policy>
+    void test_temporary_allocation_valid(Policy policy)
+    {
+        using thrust::detail::get_temporary_buffer;
+
+        return_temporary_buffer(
+            policy,
+            get_temporary_buffer<int>(
+                policy,
+                123
+            ).first,
+            123
+        );
+    }
+
+    void operator()()
+    {
+        typename PolicyInfo::policy policy;
+
+        // test correctness of attachment
+        assert_correct<test_allocator_t<int> >(policy(test_allocator_t<int>()));
+        assert_correct<test_allocator_t<int>&>(policy(test_allocator));
+        assert_correct<test_allocator_t<int> >(policy(const_test_allocator));
+
+        assert_npa_correct<test_memory_resource_t>(policy(&test_memory_resource));
+
+        // test whether the resulting policy is actually usable
+        // a real allocator is necessary here, unlike above
+        std::allocator<int> alloc;
+        const std::allocator<int> const_alloc;
+
+        test_temporary_allocation_valid(policy(std::allocator<int>()));
+        test_temporary_allocation_valid(policy(alloc));
+        test_temporary_allocation_valid(policy(const_alloc));
+        test_temporary_allocation_valid(policy(&test_memory_resource));
+
+        #if THRUST_CPP_DIALECT >= 2011
+        test_temporary_allocation_valid(policy(std::allocator<int>()).after(1));
+        test_temporary_allocation_valid(policy(alloc).after(1));
+        test_temporary_allocation_valid(policy(const_alloc).after(1));
+        #endif
+    }
+};
+
+typedef policy_info<
+    thrust::detail::seq_t,
+    thrust::system::detail::sequential::execution_policy
+> sequential_info;
+typedef policy_info<
+    thrust::system::cpp::detail::par_t,
+    thrust::system::cpp::detail::execution_policy
+> cpp_par_info;
+typedef policy_info<
+    thrust::system::omp::detail::par_t,
+    thrust::system::omp::detail::execution_policy
+> omp_par_info;
+typedef policy_info<
+    thrust::system::tbb::detail::par_t,
+    thrust::system::tbb::detail::execution_policy
+> tbb_par_info;
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+typedef policy_info<
+    thrust::system::cuda::detail::par_t,
+    thrust::cuda_cub::execute_on_stream_base
+> cuda_par_info;
+#endif
+
+SimpleUnitTest<
+    TestAllocatorAttachment,
+    unittest::type_list<
+        sequential_info,
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cuda_par_info,
+#endif
+        cpp_par_info,
+        omp_par_info,
+        tbb_par_info
+    >
+> TestAllocatorAttachmentInstance;
diff --git a/testing/async/CMakeLists.txt b/testing/async/CMakeLists.txt
new file mode 100644
index 000000000..00d50f097
--- /dev/null
+++ b/testing/async/CMakeLists.txt
@@ -0,0 +1,80 @@
+# The async tests perform a large amount of codegen, making them expensive to
+# build and test. To keep compilation and runtimes manageable, the tests are
+# broken up into many files per algorithm to enable parallelism during
+# compilation and testing. The structure of these test directories are:
+#
+# thrust/testing/async/<algorithm_name>/<unit_test>.cu
+#
+# These generate executables and CTest tests named
+# ${config_prefix}.test.async.<algorithm_name>.<unit_test>.
+
+# The async tests only support CUDA enabled configs. Create a list of valid
+# thrust targets:
+set(cuda_configs)
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (config_device STREQUAL CUDA)
+    list(APPEND cuda_configs ${thrust_target})
+  endif()
+endforeach()
+
+list(LENGTH cuda_configs num_cuda_configs)
+if (num_cuda_configs EQUAL 0)
+  return() # No valid configs found, nothing to do.
+endif()
+
+# Process a single algorithm directory, adding all .cu/cpp files as tests for
+# each valid backend. algo_name is the name of the subdir (<algorithm_name>
+# above) and is used for naming the executable/targets.
+function(thrust_add_async_test_dir algo_name)
+  file(GLOB test_srcs
+    RELATIVE "${CMAKE_CURRENT_LIST_DIR}"
+    CONFIGURE_DEPENDS
+    "${algo_name}/*.cu"
+    "${algo_name}/*.cpp"
+  )
+
+  # Per-algorithm, all-config metatarget: thrust.all.test.async.[algo].all
+  set(algo_meta_target thrust.all.test.async.${algo_name}.all)
+  add_custom_target(${algo_meta_target})
+
+  foreach(thrust_target IN LISTS cuda_configs)
+    thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+    # Per-algorithm, per-config metatarget: thrust.[config].test.async.[algo].all
+    set(algo_config_meta_target ${config_prefix}.test.async.${algo_name}.all)
+    add_custom_target(${algo_config_meta_target})
+    add_dependencies(${algo_meta_target} ${algo_config_meta_target})
+
+    foreach(test_src IN LISTS test_srcs)
+      get_filename_component(test_name "${test_src}" NAME_WLE)
+      string(PREPEND test_name async.${algo_name}.)
+
+      thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+      if(THRUST_ENABLE_TESTS_WITH_RDC)
+        thrust_enable_rdc_for_cuda_target(${test_target})
+      endif()
+
+      add_dependencies(${algo_config_meta_target} ${test_target})
+    endforeach()
+  endforeach()
+endfunction()
+
+# Grab all algorithm subdirectories:
+set(test_dirs)
+file(GLOB contents
+  CONFIGURE_DEPENDS
+  "${CMAKE_CURRENT_LIST_DIR}/*"
+)
+
+foreach(test_dir IN LISTS contents)
+  if(IS_DIRECTORY "${test_dir}")
+    list(APPEND test_dirs "${test_dir}")
+  endif()
+endforeach()
+
+# Process all test dirs:
+foreach(test_dir IN LISTS test_dirs)
+  get_filename_component(algo_name "${test_dir}" NAME_WLE)
+  thrust_add_async_test_dir(${algo_name})
+endforeach()
diff --git a/testing/async/exclusive_scan/counting_iterator.cu b/testing/async/exclusive_scan/counting_iterator.cu
new file mode 100644
index 000000000..7771299dd
--- /dev/null
+++ b/testing/async/exclusive_scan/counting_iterator.cu
@@ -0,0 +1,46 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_from_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct test_counting_iterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/discard_output.cu b/testing/async/exclusive_scan/discard_output.cu
new file mode 100644
index 000000000..ec7ca5f47
--- /dev/null
+++ b/testing/async/exclusive_scan/discard_output.cu
@@ -0,0 +1,38 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct test_discard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_indices.cu b/testing/async/exclusive_scan/large_indices.cu
new file mode 100644
index 000000000..4d1c51df0
--- /dev/null
+++ b/testing/async/exclusive_scan/large_indices.cu
@@ -0,0 +1,244 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+//
+// This allows us to test very large problem sizes without actually allocating
+// large amounts of memory that would exceed most devices' capacity.
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type*;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category =
+    typename thrust::detail::iterator_facade_category<
+      thrust::device_system_tag,
+      thrust::random_access_traversal_tag,
+      value_type,
+      reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated. This needs to be
+  // done explicitly from the host.
+  void initialize_shared_state()
+  {
+    found_max        = thrust::device_malloc<bool>(1);
+    unexpected_value = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator. This needs to be
+  // done explicitly from the host.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__
+  assert_sequence_iterator clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator&& it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type()
+    {
+      iter.free_shared_state();
+    }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType&)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{0,
+                                // minus one bc exclusive scan:
+                                static_cast<value_type>(num_values - 1),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType& e,
+                              output_t const&,
+                              output_t const& test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<uint64_t>                // - initial_value
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{}, std::tuple<uint64_t>{0}};
+  }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // anon namespace
+
+void test_large_indices_default_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_default_scan_op);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple<     // List any extra arg overloads:
+    std::tuple<uint64_t, thrust::maximum<>> // - initial_value, binop
+  >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::make_tuple(0, thrust::maximum<>{})};
+  }
+};
+
+struct custom_bin_op_invoker
+  : testing::async::mixin::input::counting_iterator_from_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // namespace
+
+void test_large_indices_custom_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_custom_scan_op);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/large_types.cu b/testing/async/exclusive_scan/large_types.cu
new file mode 100644
index 000000000..571d39262
--- /dev/null
+++ b/testing/async/exclusive_scan/large_types.cu
@@ -0,0 +1,58 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct test_large_types
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+  }
+};
+DECLARE_UNITTEST(test_large_types);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixed_types.cu b/testing/async/exclusive_scan/mixed_types.cu
new file mode 100644
index 000000000..f69af1794
--- /dev/null
+++ b/testing/async/exclusive_scan/mixed_types.cu
@@ -0,0 +1,120 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - initial_value_type     | (int, float, <none>)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The initial_value_type and thrust::plus<T> types are covered by the
+// mixin::postfix_args::scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+template <typename value_type>
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<         // Overloads to test:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, thrust::plus<>>,     // - initial_value, plus<>
+    std::tuple<value_type, thrust::plus<int>>,  // - initial_value, plus<int>
+    std::tuple<value_type, thrust::plus<float>> // - initial_value, plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::tuple<>{},
+      std::make_tuple(static_cast<value_type>(5.5)),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<>{}),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<int>{}),
+      std::make_tuple(static_cast<value_type>(5.5), thrust::plus<float>{})};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type,
+          typename initial_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args<initial_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    // Use almost_equal instead of almost_equal_if_fp because floating point
+    // addition may be hidden in the scan_op (thrust::plus<float> is always
+    // tested).
+    , testing::async::mixin::compare_outputs::assert_almost_equal
+{
+  static std::string description()
+  {
+    return "mixed input/output/initial type tests";
+  }
+};
+
+} // namespace
+
+void test_scan_mixed_types(size_t num_values)
+{
+  // Since fp addition is non-associative, the results may be slightly off
+  // from the reference.
+  // This is primarily handled by using `compare_almost_equal` to do a fuzzy
+  // comparison. But for large enough test sizes, eventually the scan results
+  // will wrap for integral value_types. If a float accumulator is used, the
+  // small errors from non-associative addition may cause the wrap to happen in
+  // a different location, resulting in an error too large for almost_equal to
+  // ignore.
+  // This wrap seems to happen around 2^16 values, so skip when num_values is
+  // close to that.
+  if (num_values > ((1ll << 16) - 10))
+  {
+    return;
+  }
+
+  // invoker template params are input_value_type, output_vt, initial_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, int, float>>::run(num_values);
+  test_policy_overloads<invoker<int, float, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int, int>>::run(num_values);
+  test_policy_overloads<invoker<float, int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, float, int>>::run(num_values);
+  // We all float down here
+  test_policy_overloads<invoker<float, float, float>>::run(num_values);
+}
+DECLARE_SIZED_UNITTEST(test_scan_mixed_types);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/mixin.h b/testing/async/exclusive_scan/mixin.h
new file mode 100644
index 000000000..02ac9908f
--- /dev/null
+++ b/testing/async/exclusive_scan/mixin.h
@@ -0,0 +1,119 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace exclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple<         // List any extra arg overloads:
+    std::tuple<>,                               // - no extra args
+    std::tuple<value_type>,                     // - initial_value
+    std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{},
+                             std::make_tuple(value_type{42}),
+                             std::make_tuple(value_type{42},
+                                             alternate_binary_op{})};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::exclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct simple
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::exclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace exclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/simple.cu b/testing/async/exclusive_scan/simple.cu
new file mode 100644
index 000000000..8c55052d7
--- /dev/null
+++ b/testing/async/exclusive_scan/simple.cu
@@ -0,0 +1,72 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<simple_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple_in_place
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = simple_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/stateful_operator.cu b/testing/async/exclusive_scan/stateful_operator.cu
new file mode 100644
index 000000000..411ffbd99
--- /dev/null
+++ b/testing/async/exclusive_scan/stateful_operator.cu
@@ -0,0 +1,62 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<                   // Single overload:
+    std::tuple<value_type, stateful_operator<value_type>> // init_val, bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::make_tuple(value_type{42},
+                      stateful_operator<value_type>{value_type{2}})};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::exclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct test_stateful_operator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/exclusive_scan/using_vs_adl.cu b/testing/async/exclusive_scan/using_vs_adl.cu
new file mode 100644
index 000000000..34a80bd79
--- /dev/null
+++ b/testing/async/exclusive_scan/using_vs_adl.cu
@@ -0,0 +1,171 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/exclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      exclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::exclusive_scan;
+    thrust::device_event e =
+      exclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void test_using_namespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_namespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename initial_value_type  = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::exclusive_scan::mixin::postfix_args::
+        all_overloads<initial_value_type, alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::exclusive_scan`";
+  }
+};
+
+void test_using_cpo()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_cpo);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/counting_iterator.cu b/testing/async/inclusive_scan/counting_iterator.cu
new file mode 100644
index 000000000..fe9fdeb80
--- /dev/null
+++ b/testing/async/inclusive_scan/counting_iterator.cu
@@ -0,0 +1,45 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <algorithm>
+#include <limits>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : testing::async::mixin::input::counting_iterator_from_0<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "fancy input iterator (counting_iterator)";
+  }
+};
+
+template <typename T>
+struct test_counting_iterator
+{
+  void operator()(std::size_t num_values) const
+  {
+    num_values = unittest::truncate_to_max_representable<T>(num_values);
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+// Use built-in types only, counting_iterator doesn't seem to be compatible with
+// the custom_numeric.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_counting_iterator,
+                                          BuiltinNumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/discard_output.cu b/testing/async/inclusive_scan/discard_output.cu
new file mode 100644
index 000000000..c202de7f0
--- /dev/null
+++ b/testing/async/inclusive_scan/discard_output.cu
@@ -0,0 +1,37 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Compilation test with discard iterators. No runtime validation is actually
+// performed, other than testing whether the algorithm completes without
+// exception.
+
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct discard_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::discard_iterator
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::noop
+{
+  static std::string description() { return "discard output"; }
+};
+
+template <typename T>
+struct test_discard
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<discard_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_discard, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_indices.cu b/testing/async/inclusive_scan/large_indices.cu
new file mode 100644
index 000000000..4124cf96d
--- /dev/null
+++ b/testing/async/inclusive_scan/large_indices.cu
@@ -0,0 +1,239 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <thrust/device_free.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_ptr.h>
+#include <thrust/optional.h>
+
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/iterator_facade_category.h>
+
+#include <cstdint>
+
+// This test is an adaptation of TestInclusiveScanWithBigIndices from scan.cu.
+
+namespace
+{
+
+// Fake iterator that asserts
+// (a) it is written with a sequence and
+// (b) a defined maximum value is written at some point
+//
+// This allows us to test very large problem sizes without actually allocating
+// large amounts of memory that would exceed most devices' capacity.
+struct assert_sequence_iterator
+{
+  using value_type      = std::int64_t;
+  using difference_type = std::int64_t;
+
+  // Defined for thrust::iterator_traits:
+  using pointer           = value_type *;
+  using reference         = assert_sequence_iterator; // weird but convenient
+  using iterator_category = typename thrust::detail::iterator_facade_category<
+    thrust::device_system_tag,
+    thrust::random_access_traversal_tag,
+    value_type,
+    reference>::type;
+
+  std::int64_t expected{0};
+  std::int64_t max{0};
+  mutable thrust::device_ptr<bool> found_max{nullptr};
+  mutable thrust::device_ptr<bool> unexpected_value{nullptr};
+
+  // Should be called on the first iterator generated. This needs to be done
+  // explicitly from the host.
+  void initialize_shared_state()
+  {
+    found_max         = thrust::device_malloc<bool>(1);
+    unexpected_value  = thrust::device_malloc<bool>(1);
+    *found_max        = false;
+    *unexpected_value = false;
+  }
+
+  // Should be called only once on the initialized iterator. This needs to be
+  // done explicitly from the host.
+  void free_shared_state() const
+  {
+    thrust::device_free(found_max);
+    thrust::device_free(unexpected_value);
+    found_max        = nullptr;
+    unexpected_value = nullptr;
+  }
+
+  __host__ __device__ assert_sequence_iterator operator+(difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  __host__ __device__ reference operator[](difference_type i) const
+  {
+    return clone(expected + i);
+  }
+
+  // Some weirdness, this iterator acts like its own reference
+  __device__ assert_sequence_iterator operator=(value_type val)
+  {
+    if (val != expected)
+    {
+      printf("Error: expected %lld, got %lld\n", expected, val);
+
+      *unexpected_value = true;
+    }
+    else if (val == max)
+    {
+      *found_max = true;
+    }
+
+    return *this;
+  }
+
+private:
+  __host__ __device__ assert_sequence_iterator
+  clone(value_type new_expected) const
+  {
+    return {new_expected, max, found_max, unexpected_value};
+  }
+};
+
+// output mixin that generates assert_sequence_iterators.
+// Must be paired with validate_assert_sequence_iterators mixin to free
+// shared state.
+struct assert_sequence_output
+{
+  struct output_type
+  {
+    using iterator = assert_sequence_iterator;
+
+    iterator iter;
+
+    explicit output_type(iterator &&it)
+        : iter{std::move(it)}
+    {
+      iter.initialize_shared_state();
+    }
+
+    ~output_type() { iter.free_shared_state(); }
+
+    iterator begin() { return iter; }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values, InputType &)
+  {
+    using value_type = typename assert_sequence_iterator::value_type;
+    assert_sequence_iterator it{1,
+                                static_cast<value_type>(num_values),
+                                nullptr,
+                                nullptr};
+    return output_type{std::move(it)};
+  }
+};
+
+struct validate_assert_sequence_iterators
+{
+  using output_t = assert_sequence_output::output_type;
+
+  template <typename EventType>
+  static void compare_outputs(EventType &e,
+                              output_t const &,
+                              output_t const &test)
+  {
+    testing::async::mixin::compare_outputs::detail::basic_event_validation(e);
+
+    ASSERT_EQUAL(*test.iter.unexpected_value, false);
+    ASSERT_EQUAL(*test.iter.found_max, true);
+  }
+};
+
+//------------------------------------------------------------------------------
+// Overloads without custom binary operators use thrust::plus<>, so use
+// constant input iterator to generate the output sequence:
+struct default_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>                        // - no extra args
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return std::tuple<std::tuple<>>{};
+  }
+};
+
+struct default_bin_op_invoker
+    : testing::async::mixin::input::constant_iterator_1<std::int64_t>
+    , assert_sequence_output
+    , default_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with default binary operator";
+  }
+};
+
+} // end anon namespace
+
+void test_large_indices_default_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<default_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_default_scan_op);
+
+namespace
+{
+
+//------------------------------------------------------------------------------
+// Generate the output sequence using counting iterators and thrust::max<> for
+// custom operator overloads.
+struct custom_bin_op_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<thrust::maximum<>>       // - custom binary op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::make_tuple(thrust::maximum<>{})};
+  }
+};
+
+struct custom_bin_op_invoker
+    : testing::async::mixin::input::counting_iterator_from_1<std::int64_t>
+    , assert_sequence_output
+    , custom_bin_op_overloads
+    , testing::async::mixin::invoke_reference::noop
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , validate_assert_sequence_iterators
+{
+  static std::string description()
+  {
+    return "test large array indices with custom binary operator";
+  }
+};
+
+} // end anon namespace
+
+void test_large_indices_custom_scan_op()
+{
+  // Test problem sizes around signed/unsigned int max:
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 30);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 31);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 32);
+  testing::async::test_policy_overloads<custom_bin_op_invoker>::run(1ll << 33);
+}
+DECLARE_UNITTEST(test_large_indices_custom_scan_op);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/large_types.cu b/testing/async/inclusive_scan/large_types.cu
new file mode 100644
index 000000000..00bb8b461
--- /dev/null
+++ b/testing/async/inclusive_scan/large_types.cu
@@ -0,0 +1,58 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+#include <unittest/special_types.h>
+
+// This test is an adaptation of TestScanWithLargeTypes from scan.cu.
+
+// Need special initialization for the FixedVector type:
+template <typename value_type>
+struct device_vector_fill
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::fill(input.begin(), input.end(), value_type{2});
+    return input;
+  }
+};
+
+template <typename value_type, typename alternate_binary_op = thrust::maximum<>>
+struct invoker
+    : device_vector_fill<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "scan with large value types.";
+  }
+};
+
+struct test_large_types
+{
+  void operator()(std::size_t num_values) const
+  {
+    using testing::async::test_policy_overloads;
+
+    test_policy_overloads<invoker<FixedVector<int, 1>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 8>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 32>>>::run(num_values);
+    test_policy_overloads<invoker<FixedVector<int, 64>>>::run(num_values);
+  }
+};
+DECLARE_UNITTEST(test_large_types);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixed_types.cu b/testing/async/inclusive_scan/mixed_types.cu
new file mode 100644
index 000000000..57931c8d0
--- /dev/null
+++ b/testing/async/inclusive_scan/mixed_types.cu
@@ -0,0 +1,109 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Test using mixed int/float types for:
+// - input_value_type       | (int, float)
+// - output_value_type      | (int, float)
+// - thrust::plus<T> T-type | (int, float, void>)
+//
+// The thrust::plus<T> types are covered by the
+// scan_mixed_types_overloads component.
+//
+// The testing/scan.cu TestMixedTypes test spells out the expected behavior,
+// which is defined by https://wg21.link/P0571.
+
+namespace
+{
+
+template <typename value_type>
+struct mixed_type_input_generator
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     // fractional values are chosen deliberately to test
+                     // casting orders and accumulator types:
+                     static_cast<value_type>(1.5),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+// A fractional value is used to ensure that a different result is obtained when
+// using float vs. int.
+struct mixed_types_postfix_args
+{
+  using postfix_args_type = std::tuple<  // Overloads to test:
+    std::tuple<>,                        // - no extra args
+    std::tuple<thrust::plus<>>,          // - plus<>
+    std::tuple<thrust::plus<int>>,       // - plus<int>
+    std::tuple<thrust::plus<float>>      // - plus<float>
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{},
+                             std::make_tuple(thrust::plus<>{}),
+                             std::make_tuple(thrust::plus<int>{}),
+                             std::make_tuple(thrust::plus<float>{})};
+  }
+};
+
+template <typename input_value_type,
+          typename output_value_type>
+struct invoker
+    : mixed_type_input_generator<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , mixed_types_postfix_args
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    // Use almost_equal instead of almost_equal_if_fp because floating point
+    // addition may be hidden in the scan_op (thrust::plus<float> is always
+    // tested).
+    , testing::async::mixin::compare_outputs::assert_almost_equal
+{
+  static std::string description()
+  {
+    return "mixed input/output/functor value_type tests";
+  }
+};
+
+} // namespace
+
+void test_scan_mixed_types(size_t num_values)
+{
+  // Since fp addition is non-associative, the results may be slightly off
+  // from the reference.
+  // This is primarily handled by using `compare_almost_equal` to do a fuzzy
+  // comparison. But for large enough test sizes, eventually the scan results
+  // will wrap for integral value_types. If a float accumulator is used, the
+  // small errors from non-associative addition may cause the wrap to happen in
+  // a different location, resulting in an error too large for almost_equal to
+  // ignore.
+  // This wrap seems to happen around 2^16 values, so skip when num_values is
+  // close to that.
+  if (num_values > ((1ll << 16) - 10))
+  {
+    return;
+  }
+
+  // invoker template params are input_value_type, output_vt:
+  using testing::async::test_policy_overloads;
+  test_policy_overloads<invoker<int, int>>::run(num_values);
+  test_policy_overloads<invoker<int, float>>::run(num_values);
+  test_policy_overloads<invoker<float, int>>::run(num_values);
+  test_policy_overloads<invoker<float, float>>::run(num_values);
+}
+DECLARE_SIZED_UNITTEST(test_scan_mixed_types);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/mixin.h b/testing/async/inclusive_scan/mixin.h
new file mode 100644
index 000000000..82ecd59b8
--- /dev/null
+++ b/testing/async/inclusive_scan/mixin.h
@@ -0,0 +1,115 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/async/scan.h>
+
+#include <thrust/scan.h>
+
+#include <async/mixin.h>
+
+namespace testing
+{
+namespace async
+{
+namespace inclusive_scan
+{
+
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+
+template <typename alternate_binary_op = thrust::maximum<>>
+struct all_overloads
+{
+  using postfix_args_type = std::tuple< // List any extra arg overloads:
+    std::tuple<>,                       // - no extra args
+    std::tuple<alternate_binary_op>     // - binary_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{std::tuple<>{}, std::make_tuple(alternate_binary_op{})};
+  }
+};
+
+} // namespace postfix_args
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    // Run host synchronous algorithm to generate reference.
+    thrust::inclusive_scan(host_input.cbegin(),
+                           host_input.cend(),
+                           host_output.begin(),
+                           std::get<PostfixArgIndices>(
+                             THRUST_FWD(postfix_tuple))...);
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+struct simple
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    auto e = thrust::async::inclusive_scan(
+      std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+      input.cbegin(),
+      input.cend(),
+      output.begin(),
+      std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+} // namespace mixin
+} // namespace inclusive_scan
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/simple.cu b/testing/async/inclusive_scan/simple.cu
new file mode 100644
index 000000000..1256f009b
--- /dev/null
+++ b/testing/async/inclusive_scan/simple.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::
+        host_synchronous<input_value_type, output_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<simple_invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple, NumericTypes);
+
+// Testing the in-place algorithm uses the exact same instantiations of the
+// underlying scan implementation as above. Test them here to avoid compiling
+// them twice.
+template <typename input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct simple_inplace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector_reuse_input<input_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        input_value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "simple in-place invocation with device vectors";
+  }
+};
+
+template <typename T>
+struct test_simple_in_place
+{
+  void operator()(std::size_t num_values) const
+  {
+    using invoker = simple_inplace_invoker<T>;
+    testing::async::test_policy_overloads<invoker>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_simple_in_place, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/stateful_operator.cu b/testing/async/inclusive_scan/stateful_operator.cu
new file mode 100644
index 000000000..224c29303
--- /dev/null
+++ b/testing/async/inclusive_scan/stateful_operator.cu
@@ -0,0 +1,61 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+namespace
+{
+
+// Custom binary operator for scan:
+template <typename T>
+struct stateful_operator
+{
+  T offset;
+
+  __host__ __device__ T operator()(T v1, T v2) { return v1 + v2 + offset; }
+};
+
+// Postfix args overload definition that uses a stateful custom binary operator
+template <typename value_type>
+struct use_stateful_operator
+{
+  using postfix_args_type = std::tuple<       // Single overload:
+    std::tuple<stateful_operator<value_type>> // bin_op
+    >;
+
+  static postfix_args_type generate_postfix_args()
+  {
+    return postfix_args_type{
+      std::make_tuple(stateful_operator<value_type>{value_type{2}})};
+  }
+};
+
+template <typename value_type>
+struct invoker
+    : testing::async::mixin::input::device_vector<value_type>
+    , testing::async::mixin::output::device_vector<value_type>
+    , use_stateful_operator<value_type>
+    , testing::async::inclusive_scan::mixin::invoke_reference::host_synchronous<
+        value_type>
+    , testing::async::inclusive_scan::mixin::invoke_async::simple
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description() { return "scan with stateful operator"; }
+};
+
+} // namespace
+
+template <typename T>
+struct test_stateful_operator
+{
+  void operator()(std::size_t num_values) const
+  {
+    testing::async::test_policy_overloads<invoker<T>>::run(num_values);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(test_stateful_operator, NumericTypes);
+
+#endif // C++14
diff --git a/testing/async/inclusive_scan/using_vs_adl.cu b/testing/async/inclusive_scan/using_vs_adl.cu
new file mode 100644
index 000000000..9789ce5c9
--- /dev/null
+++ b/testing/async/inclusive_scan/using_vs_adl.cu
@@ -0,0 +1,169 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <async/test_policy_overloads.h>
+
+#include <async/inclusive_scan/mixin.h>
+
+// Verify what happens when calling the algorithm without any namespace
+// qualifiers:
+// - If the async entry point is available in the global namespace due to a
+//   using statement, the async algorithm should be called.
+// - Otherwise, ADL should resolve the call to the synchronous algo in the
+//   thrust:: namespace.
+
+namespace invoke_reference
+{
+
+template <typename input_value_type,
+          typename output_value_type = input_value_type>
+struct adl_host_synchronous
+{
+  template <typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static void invoke_reference(InputType const& input,
+                               OutputType& output,
+                               PostfixArgTuple&& postfix_tuple,
+                               std::index_sequence<PostfixArgIndices...>)
+  {
+    // Create host versions of the input/output:
+    thrust::host_vector<input_value_type> host_input(input.cbegin(),
+                                                     input.cend());
+    thrust::host_vector<output_value_type> host_output(host_input.size());
+
+    using OutIter = thrust::remove_cvref_t<decltype(host_output.begin())>;
+
+    // ADL should resolve this to the synchronous `thrust::` algorithm.
+    // This is checked by ensuring that the call returns an output iterator.
+    OutIter result =
+      inclusive_scan(host_input.cbegin(),
+                     host_input.cend(),
+                     host_output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    (void)result;
+
+    // Copy back to device.
+    output = host_output;
+  }
+};
+
+} // namespace invoke_reference
+
+namespace invoke_async
+{
+
+struct using_namespace
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using namespace thrust::async;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+struct using_cpo
+{
+  template <typename PrefixArgTuple,
+            std::size_t... PrefixArgIndices,
+            typename InputType,
+            typename OutputType,
+            typename PostfixArgTuple,
+            std::size_t... PostfixArgIndices>
+  static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+                           std::index_sequence<PrefixArgIndices...>,
+                           InputType const& input,
+                           OutputType& output,
+                           PostfixArgTuple&& postfix_tuple,
+                           std::index_sequence<PostfixArgIndices...>)
+  {
+    // Importing the CPO into the current namespace should unambiguously resolve
+    // this call to the CPO, as opposed to resolving to the thrust:: algorithm
+    // via ADL. This is verified by checking that an event is returned.
+    using thrust::async::inclusive_scan;
+    thrust::device_event e =
+      inclusive_scan(std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+                     input.cbegin(),
+                     input.cend(),
+                     output.begin(),
+                     std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+    return e;
+  }
+};
+
+} // namespace invoke_async
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_namespace_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_namespace
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with `using namespace thrust::async`";
+  }
+};
+
+void test_using_namespace()
+{
+  using invoker = using_namespace_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_namespace);
+
+template <typename input_value_type,
+          typename output_value_type   = input_value_type,
+          typename alternate_binary_op = thrust::maximum<>>
+struct using_cpo_invoker
+    : testing::async::mixin::input::device_vector<input_value_type>
+    , testing::async::mixin::output::device_vector<output_value_type>
+    , testing::async::inclusive_scan::mixin::postfix_args::
+        all_overloads<alternate_binary_op>
+    , invoke_reference::adl_host_synchronous<input_value_type, output_value_type>
+    , invoke_async::using_cpo
+    , testing::async::mixin::compare_outputs::assert_almost_equal_if_fp_quiet
+{
+  static std::string description()
+  {
+    return "importing async CPO with "
+           "`using namespace thrust::async::inclusive_scan`";
+  }
+};
+
+void test_using_cpo()
+{
+  using invoker = using_cpo_invoker<int>;
+  testing::async::test_policy_overloads<invoker>::run(128);
+}
+DECLARE_UNITTEST(test_using_cpo);
+
+#endif // C++14
diff --git a/testing/async/mixin.h b/testing/async/mixin.h
new file mode 100644
index 000000000..6d1c06ed7
--- /dev/null
+++ b/testing/async/mixin.h
@@ -0,0 +1,663 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+
+#include <thrust/type_traits/logical_metafunctions.h>
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <tuple>
+#include <type_traits>
+
+// clang-format off
+
+// This file contains a set of mix-in classes that define an algorithm
+// definition for use with test_policy_overloads<algo_def>. The algorithm
+// definition describes the details of a thrust::async algorithm invocation:
+//
+// - Input type and initialization
+// - Output type and initialization (supports in-place, too)
+// - Postfix arguments that define the algorithm's overload set
+// - Abstracted invocation of the async algorithm
+// - Abstracted invocation of a reference algorithm
+// - Validation of async vs. reference output
+// - A description string.
+//
+// This definition is used by test_policy_overloads to test each overload
+// against a reference while injecting a variety of execution policies. This
+// validates that each overload behaves correctly according to some reference.
+//
+// Since much of the algorithm definition is generic and may be reused in
+// multiple tests with slight changes, a mix-in system is used to simplify
+// the creation of algorithm definitions. The following namespace hierarchy is
+// used to organize these generic components:
+//
+// * testing::async::mixin::
+// ** ::input - Input types/values (device vectors, counting iterators, etc)
+// ** ::output - Output types/values (device vectors, inplace device vectors,
+//                                    discard iterators, etc)
+// ** ::postfix_args - Algorithm specific overload sets
+// ** ::invoke_reference - Algorithm specific reference invocation
+// ** ::invoke_async - Algorithm specific async algo invocation
+// ** ::compare_outputs - Compare output values.
+//
+// Each algorithm should define its own `mixins.h` header to declare algorithm
+// specific mixins (e.g. postfix_args, invoke_reference, and invoke_async)
+// in a testing::async::<algorithm_name>::mixins namespace structure.
+//
+// For example, the test.async.exclusive_scan.basic test uses the following
+// algorithm definition from mix-ins:
+//
+// ```
+//   #include <async/test_policy_overloads.h>
+//   #include <async/mixin.h>
+//   #include <async/exclusive_scan/mixin.h>
+//   template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//   struct basic_invoker
+//      : testing::async::mixin::input::device_vector<input_value_type>
+//      , testing::async::mixin::output::device_vector<output_value_type>
+//      , testing::async::exclusive_scan::mixin::postfix_args::
+//          all_overloads<initial_value_type, alternate_binary_op>
+//      , testing::async::exclusive_scan::mixin::invoke_reference::
+//          host_synchronous<input_value_type, output_value_type>
+//      , testing::async::exclusive_scan::mixin::invoke_async::basic
+//      , testing::async::mixin::compare_outputs::assert_equal_quiet
+//   {
+//     static std::string description()
+//     {
+//       return "basic invocation with device vectors";
+//     }
+//   };
+//
+//   ...
+//
+//   testing::async::test_policy_overloads<basic_invoker<T>>::run(num_values);
+// ```
+//
+// The basic_invoker class expands to something similar to the following:
+//
+// ```
+//  template <typename input_value_type,
+//            typename output_value_type   = input_value_type,
+//            typename initial_value_type  = input_value_type,
+//            typename alternate_binary_op = thrust::maximum<>>
+//  struct basic_invoker
+//  {
+//  public:
+//
+//    static std::string description()
+//    {
+//      return "basic invocation with device vectors";
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::input::device_vector
+//    //
+//    // input_type must provide idiomatic definitions of:
+//    // - `using iterator = ...;`
+//    // - `iterator begin() const { ... }`
+//    // - `iterator end() const { ... }`
+//    // - `size_t size() const { ... }`
+//    using input_type = thrust::device_vector<input_value_type>;
+//
+//    // Generate an instance of the input:
+//    static input_type generate_input(std::size_t num_values)
+//    {
+//      input_type input(num_values);
+//      thrust::sequence(input.begin(), input.end(), 25, 3);
+//      return input;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::output::device_vector
+//    //
+//    // output_type must provide idiomatic definitions of:
+//    // - `using iterator = ...;`
+//    // - `iterator begin() { ... }`
+//    using output_type = thrust::device_vector<output_value_type>;
+//
+//    // Generate an instance of the output:
+//    // Might be more complicated, eg. fancy iterators, etc
+//    static output_type generate_output(std::size_t num_values)
+//    {
+//      return output_type(num_values);
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::exclusive_scan::mixin::postfix_args::all_overloads
+//    using postfix_args_type = std::tuple<   // List any extra arg overloads:
+//      std::tuple<>,                                       // - no extra args
+//      std::tuple<initial_value_type>,                     // - initial_value
+//      std::tuple<initial_value_type, alternate_binary_op> // - initial_value, binary_op
+//      >;
+//
+//    // Create instances of the extra arguments to use when invoking the
+//    // algorithm:
+//    static postfix_args_type generate_postfix_args()
+//    {
+//      return postfix_args_type{
+//        std::tuple<>{},                            // no extra args
+//        std::make_tuple(initial_value_type{42}),   // initial_value
+//        // initial_value, binary_op:
+//        std::make_tuple(initial_value_Type{57}, alternate_binary_op{})
+//      };
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    //
+//    testing::async::exclusive_scan::mixin::invoke_reference::host_synchronous
+//    //
+//    // Invoke a reference implementation for a single overload as described by
+//    // postfix_tuple. This tuple contains instances of any trailing arguments
+//    // to pass to the algorithm. The tuple/index_sequence pattern is used to
+//    // support a "no extra args" overload, since the parameter pack expansion
+//    // will do exactly what we want in all cases.
+//    template <typename PostfixArgTuple, std::size_t... PostfixArgIndices>
+//    static void invoke_reference(input_type const &input,
+//                                 output_type &output,
+//                                 PostfixArgTuple &&postfix_tuple,
+//                                 std::index_sequence<PostfixArgIndices...>)
+//    {
+//      // Create host versions of the input/output:
+//      thrust::host_vector<input_value_type> host_input(input.cbegin(),
+//                                                       input.cend());
+//      thrust::host_vector<output_value_type> host_output(host_input.size());
+//
+//      // Run host synchronous algorithm to generate reference.
+//      thrust::exclusive_scan(host_input.cbegin(),
+//                             host_input.cend(),
+//                             host_output.begin(),
+//                             std::get<PostfixArgIndices>(
+//                               THRUST_FWD(postfix_tuple))...);
+//
+//      // Copy back to device.
+//      output = host_output;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::exclusive_scan::mixin::invoke_async::basic
+//    //
+//    // Invoke the async algorithm for a single overload as described by
+//    // the prefix and postfix tuples. These tuples contains instances of any
+//    // additional arguments to pass to the algorithm. The tuple/index_sequence
+//    // pattern is used to support the "no extra args" overload, since the
+//    // parameter pack expansion will do exactly what we want in all cases.
+//    // Prefix args are included here (but not for invoke_reference) to allow
+//    // the test framework to change the execution policy.
+//    // This method must return an event or future.
+//    template <typename PrefixArgTuple,
+//              std::size_t... PrefixArgIndices,
+//              typename PostfixArgTuple,
+//              std::size_t... PostfixArgIndices>
+//    static auto invoke_async(PrefixArgTuple &&prefix_tuple,
+//                             std::index_sequence<PrefixArgIndices...>,
+//                             input_type const &input,
+//                             output_type &output,
+//                             PostfixArgTuple &&postfix_tuple,
+//                             std::index_sequence<PostfixArgIndices...>)
+//    {
+//      output.resize(input.size());
+//      auto e = thrust::async::exclusive_scan(
+//        std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+//        input.cbegin(),
+//        input.cend(),
+//        output.begin(),
+//        std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+//      return e;
+//    }
+//
+//    //-------------------------------------------------------------------------
+//    // testing::async::mixin::compare_outputs::assert_equal_quiet
+//    //
+//    // Wait on and validate the event/future (usually with TEST_EVENT_WAIT /
+//    // TEST_FUTURE_VALUE_RETRIEVAL), then check that the reference output
+//    // matches the testing output.
+//    template <typename EventType>
+//    static void compare_outputs(EventType &e,
+//                                output_type const &ref,
+//                                output_type const &test)
+//    {
+//      TEST_EVENT_WAIT(e);
+//      ASSERT_EQUAL_QUIET(ref, test);
+//    }
+// };
+// ```
+//
+// Similar invokers with slight tweaks are used in other
+// async/exclusive_scan/*.cu tests.
+
+// clang-format on
+
+namespace testing
+{
+namespace async
+{
+namespace mixin
+{
+
+//------------------------------------------------------------------------------
+namespace input
+{
+
+template <typename value_type>
+struct device_vector
+{
+  using input_type = thrust::device_vector<value_type>;
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    input_type input(num_values);
+    thrust::sequence(input.begin(),
+                     input.end(),
+                     static_cast<value_type>(1),
+                     static_cast<value_type>(1));
+    return input;
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_from_0
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(0)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(0)}; }
+
+    iterator end() const { return iterator{static_cast<value_type>(num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct counting_iterator_from_1
+{
+  struct input_type
+  {
+    using iterator = thrust::counting_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+    iterator cend() const { return iterator{static_cast<value_type>(1 + num_values)}; }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+template <typename value_type>
+struct constant_iterator_1
+{
+  struct input_type
+  {
+    using iterator = thrust::constant_iterator<value_type>;
+
+    std::size_t num_values;
+
+    iterator begin() const { return iterator{static_cast<value_type>(1)}; }
+    iterator cbegin() const { return iterator{static_cast<value_type>(1)}; }
+
+    iterator end() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+    iterator cend() const
+    {
+      return iterator{static_cast<value_type>(1)} + num_values;
+    }
+
+    std::size_t size() const { return num_values; }
+  };
+
+  static input_type generate_input(std::size_t num_values)
+  {
+    return {num_values};
+  }
+};
+
+} // namespace input
+
+//------------------------------------------------------------------------------
+namespace output
+{
+
+template <typename value_type>
+struct device_vector
+{
+  using output_type = thrust::device_vector<value_type>;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t num_values,
+                                     InputType& /* unused */)
+  {
+    return output_type(num_values);
+  }
+};
+
+template <typename value_type>
+struct device_vector_reuse_input
+{
+  using output_type = thrust::device_vector<value_type>&;
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /*num_values*/,
+                                     InputType& input)
+  {
+    return input;
+  }
+};
+
+struct discard_iterator
+{
+  struct output_type
+  {
+    using iterator = thrust::discard_iterator<>;
+
+    iterator begin() const { return thrust::make_discard_iterator(); }
+    iterator cbegin() const { return thrust::make_discard_iterator(); }
+  };
+
+  template <typename InputType>
+  static output_type generate_output(std::size_t /* num_values */,
+                                     InputType& /* input */)
+  {
+    return output_type{};
+  }
+};
+
+} // namespace output
+
+//------------------------------------------------------------------------------
+namespace postfix_args
+{
+/* Defined per algorithm. Example:
+ *
+ * // Defines several overloads:
+ * // algorithm([policy,] input, output) // no postfix args
+ * // algorithm([policy,] input, output, initial_value)
+ * // algorithm([policy,] input, output, initial_value, binary_op)
+ * template <typename value_type,
+ *           typename alternate_binary_op = thrust::maximum<>>
+ * struct all_overloads
+ * {
+ *   using postfix_args_type = std::tuple<     // List any extra arg overloads:
+ *     std::tuple<>,                               // - no extra args
+ *     std::tuple<value_type>,                     // - initial_value
+ *     std::tuple<value_type, alternate_binary_op> // - initial_value, binary_op
+ *     >;
+ *
+ *   static postfix_args_type generate_postfix_args()
+ *   {
+ *     return postfix_args_type{
+ *       std::tuple<>{},                            // no extra args
+ *       std::make_tuple(initial_value_type{42}),   // initial_value
+ *       // initial_value, binary_op:
+ *       std::make_tuple(initial_value_Type{57}, alternate_binary_op{})
+ *   }
+ * };
+ *
+ */
+}
+
+//------------------------------------------------------------------------------
+namespace invoke_reference
+{
+
+/* Defined per algorithm. Example:
+ *
+ * template <typename input_value_type,
+ *           typename output_value_type = input_value_type>
+ * struct host_synchronous
+ * {
+ *   template <typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static void invoke_reference(InputType const& input,
+ *                                OutputType& output,
+ *                                PostfixArgTuple&& postfix_tuple,
+ *                                std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     // Create host versions of the input/output:
+ *     thrust::host_vector<input_value_type> host_input(input.cbegin(),
+ *                                                      input.cend());
+ *     thrust::host_vector<output_value_type> host_output(host_input.size());
+ *
+ *     // Run host synchronous algorithm to generate reference.
+ *     // Be sure to call a backend that doesn't use the same underlying
+ *     // implementation.
+ *     thrust::exclusive_scan(host_input.cbegin(),
+ *                            host_input.cend(),
+ *                            host_output.begin(),
+ *                            std::get<PostfixArgIndices>(
+ *                              THRUST_FWD(postfix_tuple))...);
+ *
+ *     // Copy back to device.
+ *     output = host_output;
+ *   }
+ * };
+ *
+ */
+
+// Used to save time when testing unverifiable invocations (discard_iterators)
+struct noop
+{
+  template <typename... Ts>
+  static void invoke_reference(Ts&&...)
+  {}
+};
+
+} // namespace invoke_reference
+
+//------------------------------------------------------------------------------
+namespace invoke_async
+{
+
+/* Defined per algorithm. Example:
+ *
+ * struct basic
+ * {
+ *   template <typename PrefixArgTuple,
+ *             std::size_t... PrefixArgIndices,
+ *             typename InputType,
+ *             typename OutputType,
+ *             typename PostfixArgTuple,
+ *             std::size_t... PostfixArgIndices>
+ *   static auto invoke_async(PrefixArgTuple&& prefix_tuple,
+ *                            std::index_sequence<PrefixArgIndices...>,
+ *                            InputType const& input,
+ *                            OutputType& output,
+ *                            PostfixArgTuple&& postfix_tuple,
+ *                            std::index_sequence<PostfixArgIndices...>)
+ *   {
+ *     auto e = thrust::async::exclusive_scan(
+ *       std::get<PrefixArgIndices>(THRUST_FWD(prefix_tuple))...,
+ *       input.cbegin(),
+ *       input.cend(),
+ *       output.begin(),
+ *       std::get<PostfixArgIndices>(THRUST_FWD(postfix_tuple))...);
+ *     return e;
+ *   }
+ * };
+ */
+
+} // namespace invoke_async
+
+//------------------------------------------------------------------------------
+namespace compare_outputs
+{
+
+namespace detail
+{
+
+void basic_event_validation(thrust::device_event& e)
+{
+  TEST_EVENT_WAIT(e);
+}
+
+template <typename T>
+void basic_event_validation(thrust::device_future<T>& f)
+{
+  TEST_FUTURE_VALUE_RETRIEVAL(f);
+}
+
+} // namespace detail
+
+struct assert_equal
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+};
+
+struct assert_almost_equal
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types. Since fp
+// addition is non-associative, this is sometimes necessary.
+struct assert_almost_equal_if_fp
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+struct assert_equal_quiet
+{
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+};
+
+// Does an 'almost_equal' comparison for floating point types, since fp
+// addition is non-associative
+struct assert_almost_equal_if_fp_quiet
+{
+private:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::false_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_EQUAL_QUIET(ref, test);
+  }
+
+  template <typename EventType, typename OutputType>
+  static void compare_outputs_impl(EventType& e,
+                                   OutputType const& ref,
+                                   OutputType const& test,
+                                   std::true_type /* is_floating_point */)
+  {
+    detail::basic_event_validation(e);
+    ASSERT_ALMOST_EQUAL(ref, test);
+  }
+
+public:
+  template <typename EventType, typename OutputType>
+  static void compare_outputs(EventType& e,
+                              OutputType const& ref,
+                              OutputType const& test)
+  {
+    using value_type = typename OutputType::value_type;
+    compare_outputs_impl(e, ref, test, std::is_floating_point<value_type>{});
+  }
+};
+
+// Used to save time when testing unverifiable invocations (discard_iterators).
+// Just does basic validation of the future/event.
+struct noop
+{
+  template <typename EventType, typename... Ts>
+  static void compare_outputs(EventType &e, Ts&&...)
+  {
+    detail::basic_event_validation(e);
+  }
+};
+
+} // namespace compare_outputs
+
+} // namespace mixin
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async/test_policy_overloads.h b/testing/async/test_policy_overloads.h
new file mode 100644
index 000000000..b7bf1ab94
--- /dev/null
+++ b/testing/async/test_policy_overloads.h
@@ -0,0 +1,410 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/device_allocator.h>
+#include <thrust/future.h>
+
+#include <unittest/unittest.h>
+
+#include <string>
+
+// TODO Cover these cases from testing/async_reduce.cu:
+//   - [x] test_async_reduce_after ("after_future" in test_policy_overloads)
+//   - [ ] test_async_reduce_on_then_after (KNOWN_FAILURE, see #1195)
+//     - [ ] all the child variants (e.g. with allocator) too
+//   - [ ] test_async_copy_then_reduce (Need to figure out how to fit this in)
+//   - [ ] test_async_reduce_caching (only useful when returning future)
+
+namespace testing
+{
+
+namespace async
+{
+
+// Tests that policies are handled correctly for all overloads of an async
+// algorithm.
+//
+// The AlgoDef parameter type defines an async algorithm, its overloads, and
+// abstracts its invocation. See the async/mixins.h for a documented example of
+// this interface and some convenience mixins that can be used to construct a
+// definition quickly.
+//
+// The AlgoDef interface is used to run several tests of the algorithm,
+// exhaustively testing all overloads for algorithm correctness and proper
+// policy handling.
+//
+// ## Basic tests
+//
+// In the basic tests, each overload is called repeatedly with:
+// 1) No policy
+// 2) thrust::device
+// 3) thrust::device(thrust::device_allocator<void>)
+// 4) thrust::device.on(stream)
+// 5) thrust::device(thrust::device_allocator<void>).on(stream)
+//
+// The output of the async algorithm is compared against a reference output,
+// and the returned event/future is tested to make sure it holds a reference to
+// the expected stream.
+//
+// ## After Future tests
+//
+// The after_future tests check that the future/event returned from an algorithm
+// behaves properly when consumed by a policy's `.after` method.
+template <typename AlgoDef>
+struct test_policy_overloads
+{
+  using algo_def          = AlgoDef;
+  using input_type        = typename algo_def::input_type;
+  using output_type       = typename algo_def::output_type;
+  using postfix_args_type = typename algo_def::postfix_args_type;
+
+  static constexpr std::size_t num_postfix_arg_sets =
+    std::tuple_size<postfix_args_type>::value;
+
+  // Main entry point; call this from a unit test function.
+  static void run(std::size_t num_values)
+  {
+    test_postfix_overloads(num_values);
+  }
+
+private:
+  template <std::size_t Size>
+  using size_const = std::integral_constant<std::size_t, Size>;
+
+  //----------------------------------------------------------------------------
+  // Recursively call sub tests for each overload set in postfix_args:
+  template <std::size_t PostfixIdx = 0>
+  static void test_postfix_overloads(std::size_t const num_values,
+                                     size_const<PostfixIdx> = {})
+  {
+    static_assert(PostfixIdx < num_postfix_arg_sets, "Internal error.");
+
+    run_basic_policy_tests<PostfixIdx>(num_values);
+    run_after_future_tests<PostfixIdx>(num_values);
+
+    // Recurse to test next round of overloads:
+    test_postfix_overloads(num_values, size_const<PostfixIdx + 1>{});
+  }
+
+  static void test_postfix_overloads(std::size_t const,
+                                     size_const<num_postfix_arg_sets>)
+  {
+    // terminal case, no-op
+  }
+
+  //----------------------------------------------------------------------------
+  // For the specified postfix overload set, test the algorithm with several
+  // different policy configurations.
+  template <std::size_t PostfixIdx>
+  static void run_basic_policy_tests(std::size_t const num_values)
+  {
+    // When a policy uses the default stream, the algorithm implementation
+    // should spawn a new stream in the returned event:
+    auto using_default_stream = [](auto& e) {
+      ASSERT_NOT_EQUAL(thrust::cuda_cub::default_stream(),
+                       e.stream().native_handle());
+    };
+
+    // When a policy uses a non-default stream, the implementation should pass
+    // the stream through to the output:
+    thrust::system::cuda::detail::unique_stream test_stream{};
+    auto using_test_stream = [&test_stream](auto& e) {
+      ASSERT_EQUAL(test_stream.native_handle(), e.stream().native_handle());
+    };
+
+    // Test the different types of policies:
+    basic_policy_test<PostfixIdx>("(no policy)",
+                                   std::make_tuple(),
+                                   using_default_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>("thrust::device",
+                                   std::make_tuple(thrust::device),
+                                   using_default_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>(
+      "thrust::device(thrust::device_allocator<void>{})",
+      std::make_tuple(thrust::device(thrust::device_allocator<void>{})),
+      using_default_stream,
+      num_values);
+
+    basic_policy_test<PostfixIdx>("thrust::device.on(test_stream.get())",
+                                   std::make_tuple(
+                                     thrust::device.on(test_stream.get())),
+                                   using_test_stream,
+                                   num_values);
+
+    basic_policy_test<PostfixIdx>(
+      "thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())",
+      std::make_tuple(
+        thrust::device(thrust::device_allocator<void>{}).on(test_stream.get())),
+      using_test_stream,
+      num_values);
+  }
+
+  // Invoke the algorithm multiple times with the provided policy and validate
+  // the results.
+  template <std::size_t PostfixIdx,
+            typename PrefixArgTuple,
+            typename ValidateEvent>
+  static void basic_policy_test(std::string const &policy_desc,
+                                PrefixArgTuple &&prefix_tuple_ref,
+                                ValidateEvent const &validate,
+                                std::size_t num_values)
+  try
+  {
+    // Sink the prefix tuple into a const local so it can be safely passed to
+    // multiple invocations without worrying about potential modifications.
+    using prefix_tuple_type = thrust::remove_cvref_t<PrefixArgTuple>;
+    prefix_tuple_type const prefix_tuple = THRUST_FWD(prefix_tuple_ref);
+
+    using postfix_tuple_type =
+      std::tuple_element_t<PostfixIdx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<PostfixIdx>();
+
+    // Generate index sequences for the tuples:
+    constexpr auto prefix_tuple_size  = std::tuple_size<prefix_tuple_type>{};
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<prefix_tuple_size>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_d   = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_d   = algo_def::generate_output(num_values, input_d);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    // Invoke multiple overlapping async algorithms, capturing their outputs
+    // and events/futures:
+    auto e_a = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_b = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_b,
+                                      output_b,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_c = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_c,
+                                      output_c,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    auto e_d = algo_def::invoke_async(prefix_tuple,
+                                      prefix_index_seq{},
+                                      input_d,
+                                      output_d,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    // These wait on the e_X events:
+    algo_def::compare_outputs(e_a, output_ref, output_a);
+    algo_def::compare_outputs(e_b, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+    algo_def::compare_outputs(e_d, output_ref, output_d);
+
+    validate(e_a);
+    validate(e_b);
+    validate(e_c);
+    validate(e_d);
+  }
+  catch (unittest::UnitTestException &exc)
+  {
+    // Append some identifying information to the exception to help with
+    // debugging:
+    using overload_t = std::tuple_element_t<PostfixIdx, postfix_args_type>;
+
+    std::string const overload_desc =
+      unittest::demangle(typeid(overload_t).name());
+    std::string const input_desc =
+      unittest::demangle(typeid(input_type).name());
+    std::string const output_desc =
+      unittest::demangle(typeid(output_type).name());
+
+    exc << "\n"
+        << " - algo_def::description = " << algo_def::description() << "\n"
+        << " - test = basic_policy\n"
+        << " - policy = " << policy_desc << "\n"
+        << " - input_type = " << input_desc << "\n"
+        << " - output_type = " << output_desc << "\n"
+        << " - tuple of trailing arguments = " << overload_desc << "\n"
+        << " - num_values = " << num_values;
+    throw;
+  }
+
+  //----------------------------------------------------------------------------
+  // Test .after(event/future) handling:
+  template <std::size_t PostfixIdx>
+  static void run_after_future_tests(std::size_t const num_values)
+  try
+  {
+    using postfix_tuple_type =
+    std::tuple_element_t<PostfixIdx, postfix_args_type>;
+    postfix_tuple_type const postfix_tuple = get_postfix_tuple<PostfixIdx>();
+
+    // Generate index sequences for the tuples. Prefix size always = 1 here,
+    // since the async algorithms are always invoked with a single prefix
+    // arg (the execution policy) here.
+    constexpr auto postfix_tuple_size = std::tuple_size<postfix_tuple_type>{};
+    using prefix_index_seq  = std::make_index_sequence<1>;
+    using postfix_index_seq = std::make_index_sequence<postfix_tuple_size>;
+
+    // Use unique, non-const inputs for each invocation to support in-place
+    // algo_def configurations.
+    input_type input_a   = algo_def::generate_input(num_values);
+    input_type input_b   = algo_def::generate_input(num_values);
+    input_type input_c   = algo_def::generate_input(num_values);
+    input_type input_tmp = algo_def::generate_input(num_values);
+    input_type input_ref = algo_def::generate_input(num_values);
+
+    output_type output_a   = algo_def::generate_output(num_values, input_a);
+    output_type output_b   = algo_def::generate_output(num_values, input_b);
+    output_type output_c   = algo_def::generate_output(num_values, input_c);
+    output_type output_tmp = algo_def::generate_output(num_values, input_tmp);
+    output_type output_ref = algo_def::generate_output(num_values, input_ref);
+
+    auto e_a = algo_def::invoke_async(std::make_tuple(thrust::device),
+                                      prefix_index_seq{},
+                                      input_a,
+                                      output_a,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+    ASSERT_EQUAL(true, e_a.valid_stream());
+    auto const stream_a = e_a.stream().native_handle();
+
+    // Execution on default stream should create a new stream in the result:
+    ASSERT_NOT_EQUAL_QUIET(thrust::cuda_cub::default_stream(), stream_a);
+
+    //--------------------------------------------------------------------------
+    // Test event consumption when the event is an rvalue.
+    //--------------------------------------------------------------------------
+    // Using `forward_as_tuple` instead of `make_tuple` to explicitly control
+    // value categories.
+    // Explicitly order this invocation after e_a:
+    auto e_b =
+      algo_def::invoke_async(std::forward_as_tuple(thrust::device.after(e_a)),
+                             prefix_index_seq{},
+                             input_b,
+                             output_b,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_b.valid_stream());
+    auto const stream_b = e_b.stream().native_handle();
+
+    // Second invocation should use same stream as before:
+    ASSERT_EQUAL_QUIET(stream_a, stream_b);
+
+    // Verify that double consumption of e_a produces an exception:
+    ASSERT_THROWS_EQUAL(auto x = algo_def::invoke_async(
+                          std::forward_as_tuple(thrust::device.after(e_a)),
+                          prefix_index_seq{},
+                          input_tmp,
+                          output_tmp,
+                          postfix_tuple,
+                          postfix_index_seq{});
+                        THRUST_UNUSED_VAR(x),
+                        thrust::event_error,
+                        thrust::event_error(thrust::event_errc::no_state));
+
+    //--------------------------------------------------------------------------
+    // Test event consumption when the event is an lvalue
+    //--------------------------------------------------------------------------
+    // Explicitly order this invocation after e_b:
+    auto policy_after_e_b = thrust::device.after(e_b);
+    auto policy_after_e_b_tuple = std::forward_as_tuple(policy_after_e_b);
+    auto e_c =
+      algo_def::invoke_async(policy_after_e_b_tuple,
+                             prefix_index_seq{},
+                             input_c,
+                             output_c,
+                             postfix_tuple,
+                             postfix_index_seq{});
+    ASSERT_EQUAL(true, e_c.valid_stream());
+    auto const stream_c = e_c.stream().native_handle();
+
+    // Should use same stream as e_b:
+    ASSERT_EQUAL_QUIET(stream_b, stream_c);
+
+    // Verify that double consumption of e_b produces an exception:
+    ASSERT_THROWS_EQUAL(
+      auto x = algo_def::invoke_async(policy_after_e_b_tuple,
+                                      prefix_index_seq{},
+                                      input_tmp,
+                                      output_tmp,
+                                      postfix_tuple,
+                                      postfix_index_seq{});
+      THRUST_UNUSED_VAR(x),
+      thrust::event_error,
+      thrust::event_error(thrust::event_errc::no_state));
+
+    // Let reference calc overlap with async testing:
+    algo_def::invoke_reference(input_ref,
+                               output_ref,
+                               postfix_tuple,
+                               postfix_index_seq{});
+
+    // Validate results
+    // Use e_c for all three checks -- e_a and e_b will not pass the event
+    // checks since their streams were stolen by dependencies.
+    algo_def::compare_outputs(e_c, output_ref, output_a);
+    algo_def::compare_outputs(e_c, output_ref, output_b);
+    algo_def::compare_outputs(e_c, output_ref, output_c);
+  }
+  catch (unittest::UnitTestException &exc)
+  {
+    // Append some identifying information to the exception to help with
+    // debugging:
+    using postfix_t = std::tuple_element_t<PostfixIdx, postfix_args_type>;
+
+    std::string const postfix_desc =
+      unittest::demangle(typeid(postfix_t).name());
+    std::string const input_desc =
+      unittest::demangle(typeid(input_type).name());
+    std::string const output_desc =
+      unittest::demangle(typeid(output_type).name());
+
+    exc << "\n"
+        << " - algo_def::description = " << algo_def::description() << "\n"
+        << " - test = after_future\n"
+        << " - input_type = " << input_desc << "\n"
+        << " - output_type = " << output_desc << "\n"
+        << " - tuple of trailing arguments = " << postfix_desc << "\n"
+        << " - num_values = " << num_values;
+    throw;
+  }
+
+  //----------------------------------------------------------------------------
+  // Various helper functions:
+  template <std::size_t PostfixIdx>
+  static auto get_postfix_tuple()
+  {
+    return std::get<PostfixIdx>(algo_def::generate_postfix_args());
+  }
+};
+
+} // namespace async
+} // namespace testing
+
+#endif // C++14
diff --git a/testing/async_copy.cu b/testing/async_copy.cu
new file mode 100644
index 000000000..2666a6c38
--- /dev/null
+++ b/testing/async_copy.cu
@@ -0,0 +1,425 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/limits.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#define DEFINE_ASYNC_COPY_CALLABLE(name, ...)                                 \
+  struct THRUST_PP_CAT2(name, _fn)                                            \
+  {                                                                           \
+    template <typename ForwardIt, typename Sentinel, typename OutputIt>       \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    ) const                                                                   \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::copy(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(output)               \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy
+);
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host,   thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device, thrust::device
+);
+
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host_to_device,    thrust::host,   thrust::device
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device_to_host,    thrust::device, thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_host_to_host,      thrust::host,   thrust::host
+);
+DEFINE_ASYNC_COPY_CALLABLE(
+  invoke_async_copy_device_to_device,  thrust::device, thrust::device
+);
+
+#undef DEFINE_ASYNC_COPY_CALLABLE
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_host_to_device
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0(n);
+
+      auto f0 = AsyncCopyCallable{}(
+        h0.begin(), h0.end(), d0.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_host_to_device<invoke_async_copy_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_host_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_host_to_device<invoke_async_copy_host_to_device_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_host_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_device_to_host
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::host_vector<T>   h1(n);
+      thrust::device_vector<T> d0(n);
+
+      thrust::copy(h0.begin(), h0.end(), d0.begin());
+
+      ASSERT_EQUAL(h0, d0);
+
+      auto f0 = AsyncCopyCallable{}(
+        d0.begin(), d0.end(), h1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+      ASSERT_EQUAL(d0, h1);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_host<invoke_async_copy_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_device_to_host
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_host<invoke_async_copy_device_to_host_fn>::tester
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_device_to_host_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncCopyCallable>
+struct test_async_copy_device_to_device
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0(n);
+      thrust::device_vector<T> d1(n);
+
+      thrust::copy(h0.begin(), h0.end(), d0.begin());
+
+      ASSERT_EQUAL(h0, d0);
+
+      auto f0 = AsyncCopyCallable{}(
+        d0.begin(), d0.end(), d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0, d0);
+      ASSERT_EQUAL(d0, d1);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_device_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_device_to_device<invoke_async_copy_device_to_device_fn>::tester
+, NumericTypes
+, test_async_copy_device_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Non ContiguousIterator input.
+template <typename AsyncCopyCallable>
+struct test_async_copy_counting_iterator_input_to_device_vector
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(
+        unittest::truncate_to_max_representable<T>(n)
+      );
+
+      thrust::device_vector<T> d0(n);
+      thrust::device_vector<T> d1(n);
+
+      thrust::copy(first, last, d0.begin());
+
+      auto f0 = AsyncCopyCallable{}(
+        first, last, d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(d0, d1);
+    }
+  };
+};
+// TODO: Re-add custom_numeric when it supports counting iterators.
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_device_to_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_device_policies
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_device_vector<
+    invoke_async_copy_host_to_device_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_host_to_device_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Non ContiguousIterator input.
+template <typename AsyncCopyCallable>
+struct test_async_copy_counting_iterator_input_to_host_vector
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(
+        unittest::truncate_to_max_representable<T>(n)
+      );
+
+      thrust::host_vector<T> d0(n);
+      thrust::host_vector<T> d1(n);
+
+      thrust::copy(first, last, d0.begin());
+
+      auto f0 = AsyncCopyCallable{}(
+        first, last, d1.begin()
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(d0, d1);
+
+      #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL)
+      // ICC fails this for some unknown reason - see #1468.
+      KNOWN_FAILURE;
+      #endif
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_host_vector<
+    invoke_async_copy_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_host
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_counting_iterator_input_to_host_vector<
+    invoke_async_copy_device_to_host_fn
+  >::tester
+, BuiltinNumericTypes
+, test_async_copy_counting_iterator_input_trivially_relocatable_elements_device_to_host_policies
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_roundtrip
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(n);
+
+    auto e0 = thrust::async::copy(
+      thrust::host, thrust::device
+    , h0.begin(), h0.end(), d0.begin()
+    );
+
+    auto e1 = thrust::async::copy(
+      thrust::device.after(e0), thrust::host
+    , d0.begin(), d0.end(), h0.begin()
+    );
+
+    TEST_EVENT_WAIT(e1);
+
+    ASSERT_EQUAL(h0, d0);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  test_async_copy_roundtrip
+, BuiltinNumericTypes
+, test_async_copy_trivially_relocatable_elements_roundtrip
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h1(n);
+    thrust::device_vector<T> d0(n);
+    thrust::device_vector<T> d1(n);
+    thrust::device_vector<T> d2(n);
+
+    auto e0 = thrust::async::copy(
+      h0.begin(), h0.end(), d0.begin()
+    );
+
+    ASSERT_EQUAL(true, e0.valid_stream());
+
+    auto const e0_stream = e0.stream().native_handle();
+
+    auto e1 = thrust::async::copy(
+      thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::device.after(e0), d0.begin(), d0.end(), d1.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(e1);
+
+    auto e2 = thrust::async::copy(
+      thrust::host, after_policy2
+    , h0.begin(), h0.end(), d2.begin()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::copy(
+        thrust::host, after_policy2
+      , h0.begin(), h0.end(), d2.begin()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e2.stream().native_handle());
+
+    auto e3 = thrust::async::copy(
+      thrust::device.after(e2), thrust::host
+    , d1.begin(), d1.end(), h1.begin()
+    );
+
+    ASSERT_EQUAL_QUIET(e0_stream, e3.stream().native_handle());
+
+    TEST_EVENT_WAIT(e3);
+
+    ASSERT_EQUAL(h0, h1);
+    ASSERT_EQUAL(h0, d0);
+    ASSERT_EQUAL(h0, d1);
+    ASSERT_EQUAL(h0, d2);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_copy_after
+, BuiltinNumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: device_to_device NonContiguousIterator output (discard_iterator).
+
+// TODO: host_to_device non trivially relocatable.
+
+// TODO: device_to_host non trivially relocatable.
+
+// TODO: host_to_device NonContiguousIterator input (counting_iterator).
+
+// TODO: host_to_device NonContiguousIterator output (discard_iterator).
+
+// TODO: device_to_host NonContiguousIterator input (counting_iterator).
+
+// TODO: device_to_host NonContiguousIterator output (discard_iterator).
+
+// TODO: Mixed types, needs loosening of `is_trivially_relocatable_to` logic.
+
+// TODO: H->D copy, then dependent D->H copy (round trip).
+// Can't do this today because we can't do cross-system with explicit policies.
+
+#endif
+
diff --git a/testing/async_for_each.cu b/testing/async_for_each.cu
new file mode 100644
index 000000000..a09adf255
--- /dev/null
+++ b/testing/async_for_each.cu
@@ -0,0 +1,99 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+
+#include <thrust/async/for_each.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+#define DEFINE_ASYNC_FOR_EACH_CALLABLE(name, ...)                             \
+  struct THRUST_PP_CAT2(name, _fn)                                            \
+  {                                                                           \
+    template <typename ForwardIt, typename Sentinel, typename UnaryFunction>  \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, UnaryFunction&& f                   \
+    ) const                                                                   \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::for_each(                                              \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(f)                    \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_FOR_EACH_CALLABLE(
+  invoke_async_for_each
+);
+
+DEFINE_ASYNC_FOR_EACH_CALLABLE(
+  invoke_async_for_each_device, thrust::device
+);
+
+#undef DEFINE_ASYNC_FOR_EACH_CALLABLE
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct inplace_divide_by_2
+{
+  template <typename T>
+  __host__ __device__
+  void operator()(T& x) const
+  {
+    x /= 2;
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename AsyncForEachCallable, typename UnaryFunction>
+struct test_async_for_each
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0_data(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0_data(h0_data);
+
+      thrust::for_each(h0_data.begin(), h0_data.end(), UnaryFunction{});
+
+      auto f0 = AsyncForEachCallable{}(
+        d0_data.begin(), d0_data.end(), UnaryFunction{}
+      );
+
+      f0.wait();
+
+      ASSERT_EQUAL(h0_data, d0_data);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_for_each<
+      invoke_async_for_each_fn
+    , inplace_divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_for_each
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_for_each<
+      invoke_async_for_each_device_fn
+    , inplace_divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_for_each_policy
+);
+
+#endif
+
diff --git a/testing/async_reduce.cmake b/testing/async_reduce.cmake
new file mode 100644
index 000000000..44c0fbda1
--- /dev/null
+++ b/testing/async_reduce.cmake
@@ -0,0 +1,4 @@
+# Disable unreachable code warnings.
+# This test unconditionally throws in some places, the compiler will detect that
+# control flow will never reach some instructions. This is intentional.
+target_link_libraries(${test_target} PRIVATE thrust.silence_unreachable_code_warnings)
diff --git a/testing/async_reduce.cu b/testing/async_reduce.cu
new file mode 100644
index 000000000..c033c2311
--- /dev/null
+++ b/testing/async_reduce.cu
@@ -0,0 +1,1139 @@
+#define THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/reduce.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+template <typename T>
+struct custom_plus
+{
+  __host__ __device__
+  T operator()(T lhs, T rhs) const
+  {
+    return lhs + rhs;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(                                 \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::reduce(                                                \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_REDUCE_INVOKER(NAME, ...)                                \
+  DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(                                       \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_REDUCE_INVOKER(NAME, ...)                                 \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::reduce(                                                       \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_init_custom_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_init_custom_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INVOKER(
+  reduce_async_invoker_device_allocator_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceInvoker
+, template <typename> class SyncReduceInvoker
+>
+struct test_async_reduce
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      AsyncReduceInvoker<T> invoke_async;
+      SyncReduceInvoker<T>  invoke_sync;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end());
+      auto f0b = invoke_async(d0b.begin(), d0b.end());
+      auto f0c = invoke_async(d0c.begin(), d0c.end());
+      auto f0d = invoke_async(d0d.begin(), d0d.end());
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(h0.begin(), h0.end());
+
+      auto const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+      auto const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+      auto const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f0c);
+      auto const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f0d);
+
+      ASSERT_EQUAL(r0, r1a);
+      ASSERT_EQUAL(r0, r1b);
+      ASSERT_EQUAL(r0, r1c);
+      ASSERT_EQUAL(r0, r1d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_on_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce<
+      reduce_async_invoker_device_allocator_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_policy_allocator_on_init_custom_plus
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceInvoker
+, template <typename> class SyncReduceInvoker
+>
+struct test_async_reduce_counting_iterator
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()()
+    {
+      constexpr std::size_t n = 15 * sizeof(T);
+
+      ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(n);
+
+      AsyncReduceInvoker<T> invoke_async;
+      SyncReduceInvoker<T>  invoke_sync;
+
+      auto f0a = invoke_async(first, last);
+      auto f0b = invoke_async(first, last);
+      auto f0c = invoke_async(first, last);
+      auto f0d = invoke_async(first, last);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(first, last);
+
+      auto const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+      auto const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+      auto const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f0c);
+      auto const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f0d);
+
+      ASSERT_EQUAL(r0, r1a);
+      ASSERT_EQUAL(r0, r1b);
+      ASSERT_EQUAL(r0, r1c);
+      ASSERT_EQUAL(r0, r1d);
+    }
+  };
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_counting_iterator_init_custom_plus
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_counting_iterator<
+      reduce_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_reduce_policy_counting_iterator_init_custom_plus
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_using
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0a(h0);
+    thrust::device_vector<T> d0b(h0);
+
+    ASSERT_EQUAL(h0, d0a);
+    ASSERT_EQUAL(h0, d0b);
+
+    thrust::device_future<T> f0a;
+    thrust::device_future<T> f0b;
+
+    // When you import the customization points into the global namespace,
+    // they should be selected instead of the synchronous algorithms.
+    {
+      using namespace thrust::async;
+      f0a = reduce(d0a.begin(), d0a.end());
+    }
+    {
+      using thrust::async::reduce;
+      f0b = reduce(d0b.begin(), d0b.end());
+    }
+
+    // ADL should find the synchronous algorithms.
+    // This potentially runs concurrently with the copies.
+    T const r0 = reduce(h0.begin(), h0.end());
+
+    T const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f0a);
+    T const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f0b);
+
+    ASSERT_EQUAL(r0, r1a);
+    ASSERT_EQUAL(r0, r1b);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_using
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    auto f0 = thrust::async::reduce(
+      d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL(true, f0.valid_stream());
+ 
+    auto const f0_stream = f0.stream().native_handle();
+
+    auto f1 = thrust::async::reduce(
+      thrust::device.after(f0), d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device.after(f0), d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(f0_stream, f1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(f1);
+
+    auto f2 = thrust::async::reduce(
+      after_policy2, d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        after_policy2, d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(f0_stream, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_on_then_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    cudaStream_t stream;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)
+    );
+
+    auto f0 = thrust::async::reduce(
+      thrust::device.on(stream), d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f0.stream().native_handle());
+
+    auto f1 = thrust::async::reduce(
+      thrust::device.after(f0), d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a future produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device.after(f0), d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f1.stream().native_handle());
+
+    auto after_policy2 = thrust::device.after(f1);
+
+    auto f2 = thrust::async::reduce(
+      after_policy2, d0.begin(), d0.end()
+    );
+
+    // Verify that double consumption of a policy produces an exception.
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        after_policy2, d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamDestroy(stream)
+    );
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_on_then_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_allocator_on_then_after
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    cudaStream_t stream0;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream0, cudaStreamNonBlocking)
+    );
+
+    cudaStream_t stream1;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking)
+    );
+
+    auto f0 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).on(stream0)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_EQUAL_QUIET(stream0, f0.stream().native_handle());
+
+    auto f1 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).after(f0)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device(thrust::device_allocator<void>{}).after(f0)
+      , d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_EQUAL_QUIET(stream0, f1.stream().native_handle());
+
+    auto f2 = thrust::async::reduce(
+      thrust::device(thrust::device_allocator<void>{}).on(stream1).after(f1)
+    , d0.begin(), d0.end()
+    );
+
+    ASSERT_THROWS_EQUAL(
+      auto x = thrust::async::reduce(
+        thrust::device(thrust::device_allocator<void>{}).on(stream1).after(f1)
+      , d0.begin(), d0.end()
+      );
+      THRUST_UNUSED_VAR(x)
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    KNOWN_FAILURE;
+    // FIXME: The below fails because you can't combine allocator attachment,
+    // `.on`, and `.after`.
+    // The `#if 0` can be removed once the KNOWN_FAILURE is resolved.
+#if 0
+    ASSERT_EQUAL_QUIET(stream1, f2.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+    T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f2);
+
+    ASSERT_EQUAL(r0, r1);
+
+    thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream0));
+    thrust::cuda_cub::throw_on_error(cudaStreamDestroy(stream1));
+#endif
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_allocator_on_then_after
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_reduce_caching
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    constexpr std::int64_t m = 32;
+
+    thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+    thrust::device_vector<T> d0(h0);
+
+    ASSERT_EQUAL(h0, d0);
+
+    T const* f0_raw_data;
+
+    {
+      // Perform one reduction to ensure there's an entry in the caching
+      // allocator.
+      auto f0 = thrust::async::reduce(d0.begin(), d0.end());
+
+      TEST_EVENT_WAIT(f0);
+
+      f0_raw_data = f0.raw_data();
+    }
+
+    for (std::int64_t i = 0; i < m; ++i)
+    {
+      auto f1 = thrust::async::reduce(d0.begin(), d0.end());
+
+      ASSERT_EQUAL(true, f1.valid_stream());
+      ASSERT_EQUAL(true, f1.valid_content());
+
+      ASSERT_EQUAL_QUIET(f0_raw_data, f1.raw_data());
+
+      // This potentially runs concurrently with the copies.
+      T const r0 = thrust::reduce(h0.begin(), h0.end());
+
+      T const r1 = TEST_FUTURE_VALUE_RETRIEVAL(f1);
+
+      ASSERT_EQUAL(r0, r1);
+    }
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_reduce_caching
+, NumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_async_copy_then_reduce
+{
+  __host__
+  void operator()(std::size_t n)
+  {
+    thrust::host_vector<T>   h0a(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0b(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0c(unittest::random_integers<T>(n));
+    thrust::host_vector<T>   h0d(unittest::random_integers<T>(n));
+
+    thrust::device_vector<T> d0a(n);
+    thrust::device_vector<T> d0b(n);
+    thrust::device_vector<T> d0c(n);
+    thrust::device_vector<T> d0d(n);
+
+    auto f0a = thrust::async::copy(h0a.begin(), h0a.end(), d0a.begin());
+    auto f0b = thrust::async::copy(h0b.begin(), h0b.end(), d0b.begin());
+    auto f0c = thrust::async::copy(h0c.begin(), h0c.end(), d0c.begin());
+    auto f0d = thrust::async::copy(h0d.begin(), h0d.end(), d0d.begin());
+
+    ASSERT_EQUAL(true, f0a.valid_stream());
+    ASSERT_EQUAL(true, f0b.valid_stream());
+    ASSERT_EQUAL(true, f0c.valid_stream());
+    ASSERT_EQUAL(true, f0d.valid_stream());
+
+    auto const f0a_stream = f0a.stream().native_handle();
+    auto const f0b_stream = f0b.stream().native_handle();
+    auto const f0c_stream = f0c.stream().native_handle();
+    auto const f0d_stream = f0d.stream().native_handle();
+
+    auto f1a = thrust::async::reduce(
+      thrust::device.after(f0a), d0a.begin(), d0a.end()
+    );
+    auto f1b = thrust::async::reduce(
+      thrust::device.after(f0b), d0b.begin(), d0b.end()
+    );
+    auto f1c = thrust::async::reduce(
+      thrust::device.after(f0c), d0c.begin(), d0c.end()
+    );
+    auto f1d = thrust::async::reduce(
+      thrust::device.after(f0d), d0d.begin(), d0d.end()
+    );
+
+    ASSERT_EQUAL(false, f0a.valid_stream());
+    ASSERT_EQUAL(false, f0b.valid_stream());
+    ASSERT_EQUAL(false, f0c.valid_stream());
+    ASSERT_EQUAL(false, f0d.valid_stream());
+
+    ASSERT_EQUAL(true, f1a.valid_stream());
+    ASSERT_EQUAL(true, f1a.valid_content());
+    ASSERT_EQUAL(true, f1b.valid_stream());
+    ASSERT_EQUAL(true, f1b.valid_content());
+    ASSERT_EQUAL(true, f1c.valid_stream());
+    ASSERT_EQUAL(true, f1c.valid_content());
+    ASSERT_EQUAL(true, f1d.valid_stream());
+    ASSERT_EQUAL(true, f1d.valid_content());
+
+    // Verify that streams were stolen.
+    ASSERT_EQUAL_QUIET(f0a_stream, f1a.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0b_stream, f1b.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0c_stream, f1c.stream().native_handle());
+    ASSERT_EQUAL_QUIET(f0d_stream, f1d.stream().native_handle());
+
+    // This potentially runs concurrently with the copies.
+    T const r0 = thrust::reduce(h0a.begin(), h0a.end());
+
+    T const r1a = TEST_FUTURE_VALUE_RETRIEVAL(f1a);
+    T const r1b = TEST_FUTURE_VALUE_RETRIEVAL(f1b);
+    T const r1c = TEST_FUTURE_VALUE_RETRIEVAL(f1c);
+    T const r1d = TEST_FUTURE_VALUE_RETRIEVAL(f1d);
+
+    ASSERT_EQUAL(r0, r1a);
+    ASSERT_EQUAL(r0, r1b);
+    ASSERT_EQUAL(r0, r1c);
+    ASSERT_EQUAL(r0, r1d);
+  }
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(
+  test_async_copy_then_reduce
+, BuiltinNumericTypes
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: when_all from reductions.
+
+#endif
+
diff --git a/testing/async_reduce_into.cu b/testing/async_reduce_into.cu
new file mode 100644
index 000000000..a4a2be99e
--- /dev/null
+++ b/testing/async_reduce_into.cu
@@ -0,0 +1,625 @@
+#define THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/reduce.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/device_make_unique.h>
+
+template <typename T>
+struct custom_plus
+{
+  __host__ __device__
+  T operator()(T lhs, T rhs) const
+  {
+    return lhs + rhs;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(                            \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::reduce_into(                                           \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_REDUCE_INTO_INVOKER(NAME, ...)                           \
+  DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(                                  \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_REDUCE_INVOKER(NAME, ...)                                 \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::reduce(                                                       \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, thrust::plus<T>()
+);
+
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_init_custom_plus
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_init_custom_plus
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+DEFINE_STATEFUL_ASYNC_REDUCE_INTO_INVOKER(
+  reduce_into_async_invoker_device_allocator_on_init_custom_plus
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::reduce_into`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+DEFINE_SYNC_REDUCE_INVOKER(
+  reduce_sync_invoker_init_custom_plus
+, THRUST_FWD(first), THRUST_FWD(last)
+, unittest::random_integer<T>()
+, custom_plus<T>()
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncReduceIntoInvoker
+, template <typename> class SyncReduceIntoInvoker
+>
+struct test_async_reduce_into
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      auto s0a = thrust::device_make_unique<T>();
+      auto s0b = thrust::device_make_unique<T>();
+      auto s0c = thrust::device_make_unique<T>();
+      auto s0d = thrust::device_make_unique<T>();
+
+      auto const s0a_ptr = s0a.get();
+      auto const s0b_ptr = s0b.get();
+      auto const s0c_ptr = s0c.get();
+      auto const s0d_ptr = s0d.get();
+
+      AsyncReduceIntoInvoker<T> invoke_async;
+      SyncReduceIntoInvoker<T>  invoke_sync;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), s0a_ptr);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), s0b_ptr);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), s0c_ptr);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), s0d_ptr);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      auto const r0 = invoke_sync(h0.begin(), h0.end());
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(r0, *s0a_ptr);
+      ASSERT_EQUAL(r0, *s0b_ptr);
+      ASSERT_EQUAL(r0, *s0c_ptr);
+      ASSERT_EQUAL(r0, *s0d_ptr);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on
+    , reduce_sync_invoker
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init
+    , reduce_sync_invoker_init
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init_plus
+    , reduce_sync_invoker_init_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_on_init_custom_plus
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_reduce_into<
+      reduce_into_async_invoker_device_allocator_on_init_custom_plus
+    , reduce_sync_invoker_init_custom_plus
+    >::tester
+  )
+, NumericTypes
+, test_async_reduce_into_policy_allocator_on_init_custom_plus
+);
+
+#endif
+
diff --git a/testing/async_sort.cu b/testing/async_sort.cu
new file mode 100644
index 000000000..c5cfeae23
--- /dev/null
+++ b/testing/async_sort.cu
@@ -0,0 +1,334 @@
+#include <thrust/detail/config.h>
+
+// Disabled on MSVC && NVCC < 11.1 for GH issue #1098.
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && defined(__CUDACC__)
+#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ < 1)
+#define THRUST_BUG_1098_ACTIVE
+#endif // NVCC version check
+#endif // MSVC + NVCC check
+
+#if THRUST_CPP_DIALECT >= 2014 && !defined(THRUST_BUG_1098_ACTIVE)
+
+#include <unittest/unittest.h>
+
+#include <thrust/async/sort.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+enum wait_policy
+{
+  wait_for_futures
+, do_not_wait_for_futures
+};
+
+template <typename T>
+struct custom_greater
+{
+  __host__ __device__
+  bool operator()(T rhs, T lhs) const
+  {
+    return lhs > rhs;
+  }
+};
+
+#define DEFINE_SORT_INVOKER(name, ...)                                        \
+  template <typename T>                                                       \
+  struct name                                                                 \
+  {                                                                           \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static void sync(                                                         \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    {                                                                         \
+      ::thrust::sort(                                                         \
+        THRUST_FWD(first), THRUST_FWD(last)                                   \
+      );                                                                      \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static auto async(                                                        \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::sort(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last)                                   \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_SORT_INVOKER(
+  sort_invoker
+);
+DEFINE_SORT_INVOKER(
+  sort_invoker_device, thrust::device
+);
+
+#define DEFINE_SORT_OP_INVOKER(name, op, ...)                                 \
+  template <typename T>                                                       \
+  struct name                                                                 \
+  {                                                                           \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static void sync(                                                         \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    {                                                                         \
+      ::thrust::sort(                                                         \
+        THRUST_FWD(first), THRUST_FWD(last), op<T>{}                          \
+      );                                                                      \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel                                   \
+    >                                                                         \
+    __host__                                                                  \
+    static auto async(                                                        \
+      ForwardIt&& first, Sentinel&& last                                      \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::async::sort(                                                  \
+        __VA_ARGS__                                                           \
+        THRUST_PP_COMMA_IF(THRUST_PP_ARITY(__VA_ARGS__))                      \
+        THRUST_FWD(first), THRUST_FWD(last), op<T>{}                          \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_less,        thrust::less
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_less_device, thrust::less, thrust::device 
+);
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_greater,        thrust::greater
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_greater_device, thrust::greater, thrust::device 
+);
+
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_custom_greater,        custom_greater
+);
+DEFINE_SORT_OP_INVOKER(
+  sort_invoker_custom_greater_device, custom_greater, thrust::device 
+);
+
+#undef DEFINE_SORT_INVOKER
+#undef DEFINE_SORT_OP_INVOKER
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <template <typename> class SortInvoker, wait_policy WaitPolicy>
+struct test_async_sort
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0_data(unittest::random_integers<T>(n));
+      thrust::device_vector<T> d0_data(h0_data);
+
+      ASSERT_EQUAL(h0_data, d0_data);
+
+      SortInvoker<T>::sync(
+        h0_data.begin(), h0_data.end()
+      );
+
+      auto f0 = SortInvoker<T>::async(
+        d0_data.begin(), d0_data.end()
+      );
+
+      THRUST_IF_CONSTEXPR(wait_for_futures == WaitPolicy)
+      {
+        f0.wait();
+
+        ASSERT_EQUAL(h0_data, d0_data);
+      }
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_less
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_less_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_less
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_less_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_less_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_greater_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_custom_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_custom_greater_no_wait
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater_device
+    , wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_custom_greater
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_sort<
+      sort_invoker_custom_greater_device
+    , do_not_wait_for_futures
+    >::tester
+  )
+, NumericTypes
+, test_async_sort_policy_custom_greater_no_wait
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO: Async copy then sort.
+
+// TODO: Test future return type.
+
+#endif
+
diff --git a/testing/async_transform.cu b/testing/async_transform.cu
new file mode 100644
index 000000000..efaa885f0
--- /dev/null
+++ b/testing/async_transform.cu
@@ -0,0 +1,533 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/async/transform.h>
+#include <thrust/async/copy.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+
+template <typename T>
+struct divide_by_2
+{
+  __host__ __device__
+  T operator()(T x) const
+  {
+    return x / 2;
+  }
+};
+
+#define DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(                        \
+    NAME, MEMBERS, CTOR, DTOR, VALIDATE, ...                                  \
+  )                                                                           \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+    MEMBERS                                                                   \
+                                                                              \
+    NAME() { CTOR }                                                           \
+                                                                              \
+    ~NAME() { DTOR }                                                          \
+                                                                              \
+    template <typename Event>                                                 \
+    void validate_event(Event& e)                                             \
+    {                                                                         \
+      THRUST_UNUSED_VAR(e);                                                   \
+      VALIDATE                                                                \
+    }                                                                         \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    , typename UnaryOperation                                                 \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    , UnaryOperation&& op                                                     \
+    )                                                                         \
+    THRUST_DECLTYPE_RETURNS(                                                  \
+      ::thrust::async::transform(                                             \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+#define DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(NAME, ...)                       \
+  DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(                              \
+    NAME                                                                      \
+  , THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY(), THRUST_PP_EMPTY()\
+  , __VA_ARGS__                                                               \
+  )                                                                           \
+  /**/
+
+#define DEFINE_SYNC_TRANSFORM_UNARY_INVOKER(NAME, ...)                        \
+  template <typename T>                                                       \
+  struct NAME                                                                 \
+  {                                                                           \
+                                                                              \
+    template <                                                                \
+      typename ForwardIt, typename Sentinel, typename OutputIt                \
+    , typename UnaryOperation                                                 \
+    >                                                                         \
+    __host__                                                                  \
+    auto operator()(                                                          \
+      ForwardIt&& first, Sentinel&& last, OutputIt&& output                   \
+    , UnaryOperation&& op                                                     \
+    )                                                                         \
+    THRUST_RETURNS(                                                           \
+      ::thrust::transform(                                                    \
+        __VA_ARGS__                                                           \
+      )                                                                       \
+    )                                                                         \
+  };                                                                          \
+  /**/
+
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device
+, thrust::device
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_allocator
+, thrust::device(thrust::device_allocator<void>{})
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::transform`.
+, thrust::device.on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+DEFINE_STATEFUL_ASYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_async_invoker_device_allocator_on
+  // Members.
+, cudaStream_t stream_;
+  // Constructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking)
+  );
+  // Destructor.
+, thrust::cuda_cub::throw_on_error(
+    cudaStreamDestroy(stream_)
+  );
+  // `validate_event` member.
+, ASSERT_EQUAL_QUIET(stream_, e.stream().native_handle());
+  // Arguments to `thrust::async::transform`.
+, thrust::device(thrust::device_allocator<void>{}).on(stream_)
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+
+DEFINE_SYNC_TRANSFORM_UNARY_INVOKER(
+  transform_unary_sync_invoker
+, THRUST_FWD(first), THRUST_FWD(last)
+, THRUST_FWD(output)
+, THRUST_FWD(op)
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      thrust::host_vector<T>   h1(n);
+
+      thrust::device_vector<T> d1a(n);
+      thrust::device_vector<T> d1b(n);
+      thrust::device_vector<T> d1c(n);
+      thrust::device_vector<T> d1d(n);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), d1a.begin(), op);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), d1b.begin(), op);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), d1c.begin(), op);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), d1d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(h0.begin(), h0.end(), h1.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      ASSERT_EQUAL(h1, d1a);
+      ASSERT_EQUAL(h1, d1b);
+      ASSERT_EQUAL(h1, d1c);
+      ASSERT_EQUAL(h1, d1d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_allocator
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_allocator_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_on_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary<
+      transform_unary_async_invoker_device_allocator_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_policy_allocator_on_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary_inplace
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+      thrust::device_vector<T> d0c(h0);
+      thrust::device_vector<T> d0d(h0);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+
+      auto f0a = invoke_async(d0a.begin(), d0a.end(), d0a.begin(), op);
+      auto f0b = invoke_async(d0b.begin(), d0b.end(), d0b.begin(), op);
+      auto f0c = invoke_async(d0c.begin(), d0c.end(), d0c.begin(), op);
+      auto f0d = invoke_async(d0d.begin(), d0d.end(), d0d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(h0.begin(), h0.end(), h0.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_allocator
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_allocator_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_on_divide_by_2
+);
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_inplace<
+      transform_unary_async_invoker_device_allocator_on
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, NumericTypes
+, test_async_transform_unary_inplace_policy_allocator_on_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class AsyncTransformUnaryInvoker
+, template <typename> class SyncTransformUnaryInvoker
+, template <typename> class UnaryOperation
+>
+struct test_async_transform_unary_counting_iterator
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()()
+    {
+      constexpr std::size_t n = 15 * sizeof(T);
+
+      ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
+
+      thrust::counting_iterator<T> first(0);
+      thrust::counting_iterator<T> last(n);
+
+      thrust::host_vector<T>   h0(n);
+
+      thrust::device_vector<T> d0a(n);
+      thrust::device_vector<T> d0b(n);
+      thrust::device_vector<T> d0c(n);
+      thrust::device_vector<T> d0d(n);
+
+      AsyncTransformUnaryInvoker<T> invoke_async;
+      SyncTransformUnaryInvoker<T>  invoke_sync;
+
+      UnaryOperation<T> op;
+
+      auto f0a = invoke_async(first, last, d0a.begin(), op);
+      auto f0b = invoke_async(first, last, d0b.begin(), op);
+      auto f0c = invoke_async(first, last, d0c.begin(), op);
+      auto f0d = invoke_async(first, last, d0d.begin(), op);
+
+      invoke_async.validate_event(f0a);
+      invoke_async.validate_event(f0b);
+      invoke_async.validate_event(f0c);
+      invoke_async.validate_event(f0d);
+
+      // This potentially runs concurrently with the copies.
+      invoke_sync(first, last, h0.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b, f0c, f0d));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+      ASSERT_EQUAL(h0, d0c);
+      ASSERT_EQUAL(h0, d0d);
+    }
+  };
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_counting_iterator<
+      transform_unary_async_invoker
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_transform_unary_counting_iterator_divide_by_2
+);
+DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND_ARGS(
+    test_async_transform_unary_counting_iterator<
+      transform_unary_async_invoker_device
+    , transform_unary_sync_invoker
+    , divide_by_2
+    >::tester
+  )
+, BuiltinNumericTypes
+, test_async_transform_unary_counting_iterator_policy_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  template <typename> class UnaryOperation
+>
+struct test_async_transform_using
+{
+  template <typename T>
+  struct tester
+  {
+    __host__
+    void operator()(std::size_t n)
+    {
+      thrust::host_vector<T>   h0(unittest::random_integers<T>(n));
+
+      thrust::device_vector<T> d0a(h0);
+      thrust::device_vector<T> d0b(h0);
+
+      thrust::host_vector<T>   h1(n);
+
+      thrust::device_vector<T> d1a(n);
+      thrust::device_vector<T> d1b(n);
+
+      UnaryOperation<T> op;
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+
+      thrust::device_event f0a;
+      thrust::device_event f0b;
+
+      // When you import the customization points into the global namespace,
+      // they should be selected instead of the synchronous algorithms.
+      {
+        using namespace thrust::async;
+        f0a = transform(d0a.begin(), d0a.end(), d1a.begin(), op);
+      }
+      {
+        using thrust::async::transform;
+        f0b = transform(d0b.begin(), d0b.end(), d1b.begin(), op);
+      }
+
+      // ADL should find the synchronous algorithms.
+      // This potentially runs concurrently with the copies.
+      transform(h0.begin(), h0.end(), h1.begin(), op);
+
+      TEST_EVENT_WAIT(thrust::when_all(f0a, f0b));
+
+      ASSERT_EQUAL(h0, d0a);
+      ASSERT_EQUAL(h0, d0b);
+
+      ASSERT_EQUAL(h1, d1a);
+      ASSERT_EQUAL(h1, d1b);
+    }
+  };
+};
+DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(
+  THRUST_PP_EXPAND(test_async_transform_using<divide_by_2>::tester)
+, NumericTypes
+, test_async_transform_using_divide_by_2
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif
+
diff --git a/testing/backend/SConscript b/testing/backend/SConscript
deleted file mode 100644
index ed6acc87b..000000000
--- a/testing/backend/SConscript
+++ /dev/null
@@ -1,19 +0,0 @@
-import os
-
-Import('env')
-
-extensions = ['*.cu', '*.cpp']
-
-# gather sources in .
-sources = []
-for ext in extensions:
-  sources.extend(env.Glob(ext))
-
-# recursively glob sources from children
-for ext in extensions:
-  sources.extend(env.RecursiveGlob(ext, 'generic'))
-  sources.extend(env.RecursiveGlob(ext, env['device_backend']))
-
-# return the result to the parent
-Return('sources')
-
diff --git a/testing/backend/cuda/arch.cu b/testing/backend/cuda/arch.cu
deleted file mode 100644
index 1e3b81c5b..000000000
--- a/testing/backend/cuda/arch.cu
+++ /dev/null
@@ -1,244 +0,0 @@
-#include <unittest/unittest.h>
-
-#if defined(__CUDACC__)
-
-#include <thrust/system/cuda/detail/runtime_introspection.h>
-#include <thrust/system/cuda/detail/cuda_launch_config.h>
-
-using namespace thrust::system::cuda::detail;
-
-void set_compute_capability(device_properties_t& properties, int major, int minor)
-{
-  properties.major = major;
-  properties.minor = minor;
-}
-
-void set_G80(device_properties_t& properties)
-{
-  set_compute_capability(properties, 1, 0);
-  properties.multiProcessorCount         = 16;
-  properties.sharedMemPerBlock           = 16384;
-  properties.regsPerBlock                = 8192;
-  properties.warpSize                    = 32;
-  properties.maxThreadsPerBlock          = 512;
-  properties.maxThreadsPerMultiProcessor = 768;
-}
-
-void set_G84(device_properties_t& properties)
-{
-  set_compute_capability(properties, 1, 1);
-  properties.multiProcessorCount         = 4;
-  properties.sharedMemPerBlock           = 16384;
-  properties.regsPerBlock                = 8192;
-  properties.warpSize                    = 32;
-  properties.maxThreadsPerBlock          = 512;
-  properties.maxThreadsPerMultiProcessor = 768;
-}
-
-void set_GT200(device_properties_t& properties)
-{
-  set_compute_capability(properties, 1, 3);
-  properties.multiProcessorCount         = 30;
-  properties.sharedMemPerBlock           = 16384;
-  properties.regsPerBlock                = 16384;
-  properties.warpSize                    = 32;
-  properties.maxThreadsPerBlock          = 512;
-  properties.maxThreadsPerMultiProcessor = 1024;
-}
-
-void set_unknown(device_properties_t& properties)
-{
-  set_compute_capability(properties, 900, 1);
-  properties.multiProcessorCount         = 9001;
-  properties.sharedMemPerBlock           = 4 * 16384;
-  properties.regsPerBlock                = 32768;
-  properties.warpSize                    = 32;
-  properties.maxThreadsPerBlock          = 4096;
-  properties.maxThreadsPerMultiProcessor = 8192;
-}
-
-void set_func_attributes(function_attributes_t& attributes,
-                         size_t constSizeBytes,           // Size of constant memory in bytes.
-                         size_t localSizeBytes,           // Size of local memory in bytes.
-                         int maxThreadsPerBlock,          // Maximum number of threads per block.
-                         int numRegs,                     // Number of registers used.
-                         size_t sharedSizeBytes)          // Size of shared memory in bytes.
-{
-    attributes.constSizeBytes     = constSizeBytes;
-    attributes.localSizeBytes     = localSizeBytes;
-    attributes.maxThreadsPerBlock = maxThreadsPerBlock; 
-    attributes.numRegs            = numRegs;
-    attributes.sharedSizeBytes    = sharedSizeBytes;
-}
-
-void TestComputeCapability(void)
-{
-    device_properties_t properties;
-
-    set_compute_capability(properties, 1, 0);
-    ASSERT_EQUAL(compute_capability(properties), 10);
-
-    set_compute_capability(properties, 1, 1);
-    ASSERT_EQUAL(compute_capability(properties), 11);
-    
-    set_compute_capability(properties, 1, 3);
-    ASSERT_EQUAL(compute_capability(properties), 13);
-    
-    set_compute_capability(properties, 2, 0);
-    ASSERT_EQUAL(compute_capability(properties), 20);
-    
-    set_compute_capability(properties, 2, 1);
-    ASSERT_EQUAL(compute_capability(properties), 21);
-}
-DECLARE_UNITTEST(TestComputeCapability);
-
-
-void TestMaxActiveBlocks(void)
-{
-    using namespace cuda_launch_config_detail;
-
-    device_properties_t   properties;
-    function_attributes_t attributes;
-
-    // Kernel #1 : Full Occupancy on all devices
-    set_func_attributes(attributes, 0, 0, 512, 10, 2048);
-    
-    set_G80(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 3);
-    set_G84(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 3);
-    set_GT200(properties); ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 4);
-    
-    // Kernel #2 : 2/3rds Occupancy on G8x and 100% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 16, 2048);
-
-    set_G80(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-    set_G84(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-    set_GT200(properties); ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 4);
-    
-    // Kernel #3 : 1/3rds Occupancy on G8x and 75% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 20, 2048);
-
-    set_G80(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 1);
-    set_G84(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 1);
-    set_GT200(properties); ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 3);
-    
-    // Kernel #4 : 1/3rds Occupancy on G8x and 50% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 21, 2048);
-
-    set_G80(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 1);
-    set_G84(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 1);
-    set_GT200(properties); ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-    
-    // Kernel #5 : 2/3rds Occupancy on G8x and 50% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 10, 8192);
-
-    set_G80(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-    set_G84(properties);   ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-    set_GT200(properties); ASSERT_EQUAL(max_active_blocks_per_multiprocessor(properties, attributes, 256, 0), 2);
-}
-DECLARE_UNITTEST(TestMaxActiveBlocks);
-
-
-void TestMaxBlocksizeWithHighestOccupancy(void)
-{
-    device_properties_t   properties;
-    function_attributes_t attributes;
-    
-    // Kernel #1 : Full Occupancy on all devices
-    set_func_attributes(attributes, 0, 0, 512, 10, 2048);
-    
-    set_G80(properties);   ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 384);
-    set_GT200(properties); ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 512);
-    
-    // Kernel #2 : 2/3rds Occupancy on G8x and 100% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 16, 2048);
-
-    set_G80(properties);   ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 512);
-    set_GT200(properties); ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 512);
-    
-    // Kernel #3 : 50% Occupancy on G8x and 75% on GT200
-    set_func_attributes(attributes, 0, 0, 256, 20, 2048);
-    
-    set_G80(properties);   ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 192);
-    set_GT200(properties); ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 256);
-    
-    // Kernel #4 : 1/3rds Occupancy on G8x and 50% on GT200
-    set_func_attributes(attributes, 0, 0, 384, 26, 2048);
-    
-    set_G80(properties);   ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 256);
-    set_GT200(properties); ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 192);
-    
-    // Kernel #5 :100% Occupancy on G8x and GT200
-    set_func_attributes(attributes, 0, 0, 512, 10, 8192);
-    
-    set_G80(properties);   ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 384);
-    set_GT200(properties); ASSERT_EQUAL(block_size_with_maximum_potential_occupancy(attributes, properties), 512);
-}
-DECLARE_UNITTEST(TestMaxBlocksizeWithHighestOccupancy);
-
-struct return_int
-{
-  int val;
-
-  return_int(int val)
-    : val(val)
-  {}
-
-  __host__ __device__
-  int operator()(int) const
-  {
-    return val;
-  }
-};
-
-static bool validate_nonzero_results(const device_properties_t   &properties,
-                                     const function_attributes_t &attributes)
-{
-  using thrust::system::cuda::detail::cuda_launch_config_detail::max_active_blocks_per_multiprocessor;
-
-  bool result = true;
-
-  // validate that all these calls return something non-zero
-  result &= (max_active_blocks_per_multiprocessor(properties, attributes, 512, 512 * 4) > 0);
-  ASSERT_EQUAL(true, result);
-
-  result &= block_size_with_maximum_potential_occupancy(attributes, properties) > 0;
-  ASSERT_EQUAL(true, result);
-
-  result &= block_size_with_maximum_potential_occupancy(attributes, properties, return_int(4)) > 0;
-  ASSERT_EQUAL(true, result);
-
-  return result;
-}
-
-void TestUnknownDeviceRobustness(void)
-{
-    device_properties_t  properties;
-    function_attributes_t attributes;
-
-    // create an unknown device
-    set_unknown(properties);
-
-    // Kernel #1 : Full Occupancy on all real devices
-    set_func_attributes(attributes, 0, 0, 512, 10, 2048);
-    ASSERT_EQUAL(true, validate_nonzero_results(properties, attributes));
-
-    // Kernel #2 : 2/3rds Occupancy on G8x and 100% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 16, 2048);
-    ASSERT_EQUAL(true, validate_nonzero_results(properties, attributes));
-
-    // Kernel #3 : 50% Occupancy on G8x and 75% on GT200
-    set_func_attributes(attributes, 0, 0, 512, 20, 2048);
-    ASSERT_EQUAL(true, validate_nonzero_results(properties, attributes));
-
-    // Kernel #4 : 1/3rds Occupancy on G8x and 50% on GT200
-    set_func_attributes(attributes, 0, 0, 384, 26, 2048);
-    ASSERT_EQUAL(true, validate_nonzero_results(properties, attributes));
-
-    // Kernel #5 :100% Occupancy on G8x and GT200
-    set_func_attributes(attributes, 0, 0, 512, 10, 8192);
-    ASSERT_EQUAL(true, validate_nonzero_results(properties, attributes));
-}
-DECLARE_UNITTEST(TestUnknownDeviceRobustness);
-
-#endif // defined(__CUDACC__)
-
diff --git a/testing/backend/cuda/pinned_allocator.cu b/testing/backend/cuda/pinned_allocator.cu
deleted file mode 100644
index 23ccc7d40..000000000
--- a/testing/backend/cuda/pinned_allocator.cu
+++ /dev/null
@@ -1,19 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
-#include <thrust/host_vector.h>
-#include <thrust/copy.h>
-
-template <typename T>
-void TestPinnedAllocatorSimple(const size_t n)
-{
-  typedef thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T> > Vector;
-
-  Vector h_input = unittest::random_integers<T>(n);
-  Vector h_output(n);
-
-  thrust::copy(h_input.begin(), h_input.end(), h_output.begin());
-
-  ASSERT_EQUAL(h_input, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestPinnedAllocatorSimple);
-
diff --git a/testing/backend/cuda/radix_sort.cu b/testing/backend/cuda/radix_sort.cu
deleted file mode 100644
index 356a70210..000000000
--- a/testing/backend/cuda/radix_sort.cu
+++ /dev/null
@@ -1,116 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/device_malloc_allocator.h>
-
-#include <thrust/sort.h>
-#include <thrust/system/cuda/detail/detail/stable_radix_sort.h>
-
-using namespace unittest;
-
-template <class Vector>
-void InitializeSimpleKeyRadixSortTest(Vector& unsorted_keys, Vector& sorted_keys)
-{
-    unsorted_keys.resize(7);
-    unsorted_keys[0] = 1; 
-    unsorted_keys[1] = 3; 
-    unsorted_keys[2] = 6;
-    unsorted_keys[3] = 5;
-    unsorted_keys[4] = 2;
-    unsorted_keys[5] = 0;
-    unsorted_keys[6] = 4;
-
-    sorted_keys.resize(7); 
-    sorted_keys[0] = 0; 
-    sorted_keys[1] = 1; 
-    sorted_keys[2] = 2;
-    sorted_keys[3] = 3;
-    sorted_keys[4] = 4;
-    sorted_keys[5] = 5;
-    sorted_keys[6] = 6;
-}
-
-template <class Vector>
-void InitializeSimpleStableKeyRadixSortTest(Vector& unsorted_keys, Vector& sorted_keys)
-{
-    unsorted_keys.resize(9);   
-    unsorted_keys[0] = 25; 
-    unsorted_keys[1] = 14; 
-    unsorted_keys[2] = 35; 
-    unsorted_keys[3] = 16; 
-    unsorted_keys[4] = 26; 
-    unsorted_keys[5] = 34; 
-    unsorted_keys[6] = 36; 
-    unsorted_keys[7] = 24; 
-    unsorted_keys[8] = 15; 
-    
-    sorted_keys.resize(9);
-    sorted_keys[0] = 14; 
-    sorted_keys[1] = 16; 
-    sorted_keys[2] = 15; 
-    sorted_keys[3] = 25; 
-    sorted_keys[4] = 26; 
-    sorted_keys[5] = 24; 
-    sorted_keys[6] = 35; 
-    sorted_keys[7] = 34; 
-    sorted_keys[8] = 36; 
-}
-
-
-template <class Vector>
-struct TestRadixSortKeySimple
-{
-  void operator()(const size_t dummy)
-  {
-    typedef typename Vector::value_type T;
-
-    Vector unsorted_keys;
-    Vector   sorted_keys;
-
-    InitializeSimpleKeyRadixSortTest(unsorted_keys, sorted_keys);
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), thrust::less<T>());
-
-    ASSERT_EQUAL(unsorted_keys, sorted_keys);
-  }
-};
-VectorUnitTest<TestRadixSortKeySimple, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_malloc_allocator> TestRadixSortKeySimpleDeviceInstance;
-
-
-typedef unittest::type_list<
-#if !(defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 1))
-// XXX GCC 4.1 miscompiles the char sorts with -O2 for some reason
-                            char,
-                            signed char,
-                            unsigned char,
-#endif
-                            short,
-                            unsigned short,
-                            int,
-                            unsigned int,
-                            long,
-                            unsigned long,
-                            long long,
-                            unsigned long long,
-                            float,
-                            double> RadixSortKeyTypes;
-
-template <typename T>
-struct TestRadixSort
-{
-  void operator()(const size_t n)
-  {
-    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
-    thrust::device_vector<T> d_keys = h_keys;
-
-    thrust::stable_sort(h_keys.begin(), h_keys.end());
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort(cuda_tag, d_keys.begin(), d_keys.end(), thrust::less<T>());
-
-    ASSERT_ALMOST_EQUAL(h_keys, d_keys);
-  }
-};
-VariableUnitTest<TestRadixSort, RadixSortKeyTypes> TestRadixSortInstance;
-
diff --git a/testing/backend/cuda/radix_sort_by_key.cu b/testing/backend/cuda/radix_sort_by_key.cu
deleted file mode 100644
index b18e77380..000000000
--- a/testing/backend/cuda/radix_sort_by_key.cu
+++ /dev/null
@@ -1,121 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/device_malloc_allocator.h>
-
-#include <thrust/sort.h>
-#include <thrust/system/cuda/detail/detail/stable_radix_sort.h>
-
-using namespace unittest;
-
-template <class Vector>
-void InitializeSimpleKeyRadixSortTest(Vector& unsorted_keys, Vector& sorted_keys)
-{
-    unsorted_keys.resize(7);
-    unsorted_keys[0] = 1; 
-    unsorted_keys[1] = 3; 
-    unsorted_keys[2] = 6;
-    unsorted_keys[3] = 5;
-    unsorted_keys[4] = 2;
-    unsorted_keys[5] = 0;
-    unsorted_keys[6] = 4;
-
-    sorted_keys.resize(7); 
-    sorted_keys[0] = 0; 
-    sorted_keys[1] = 1; 
-    sorted_keys[2] = 2;
-    sorted_keys[3] = 3;
-    sorted_keys[4] = 4;
-    sorted_keys[5] = 5;
-    sorted_keys[6] = 6;
-}
-
-template <class Vector>
-void InitializeSimpleKeyValueRadixSortTest(Vector& unsorted_keys, Vector& unsorted_values,
-                                           Vector& sorted_keys,   Vector& sorted_values)
-{
-    unsorted_keys.resize(7);   
-    unsorted_values.resize(7);   
-    unsorted_keys[0] = 1;  unsorted_values[0] = 0;
-    unsorted_keys[1] = 3;  unsorted_values[1] = 1;
-    unsorted_keys[2] = 6;  unsorted_values[2] = 2;
-    unsorted_keys[3] = 5;  unsorted_values[3] = 3;
-    unsorted_keys[4] = 2;  unsorted_values[4] = 4;
-    unsorted_keys[5] = 0;  unsorted_values[5] = 5;
-    unsorted_keys[6] = 4;  unsorted_values[6] = 6;
-    
-    sorted_keys.resize(7);
-    sorted_values.resize(7);
-    sorted_keys[0] = 0;  sorted_values[1] = 0;  
-    sorted_keys[1] = 1;  sorted_values[3] = 1;  
-    sorted_keys[2] = 2;  sorted_values[6] = 2;
-    sorted_keys[3] = 3;  sorted_values[5] = 3;
-    sorted_keys[4] = 4;  sorted_values[2] = 4;
-    sorted_keys[5] = 5;  sorted_values[0] = 5;
-    sorted_keys[6] = 6;  sorted_values[4] = 6;
-}
-
-template <class Vector>
-struct TestRadixSortKeyValueSimple
-{
-  void operator()(const size_t dummy)
-  {
-    typedef typename Vector::value_type T;
-
-    Vector unsorted_keys, unsorted_values;
-    Vector   sorted_keys,   sorted_values;
-
-    InitializeSimpleKeyValueRadixSortTest(unsorted_keys, unsorted_values, sorted_keys, sorted_values);
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort_by_key(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin(), thrust::less<T>());
-
-    ASSERT_EQUAL(unsorted_keys,   sorted_keys);
-    ASSERT_EQUAL(unsorted_values, sorted_values);
-  }
-};
-VectorUnitTest<TestRadixSortKeyValueSimple, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_malloc_allocator> TestRadixSortKeyValueSimpleDeviceInstance;
-
-
-typedef unittest::type_list<
-#if !(defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 1))
-// XXX GCC 4.1 miscompiles the char sorts with -O2 for some reason
-                            char,
-                            signed char,
-                            unsigned char,
-#endif
-                            short,
-                            unsigned short,
-                            int,
-                            unsigned int,
-                            long,
-                            unsigned long,
-                            long long,
-                            unsigned long long,
-                            float,
-                            double> RadixSortKeyTypes;
-
-template <typename T>
-struct TestRadixSortByKey
-{
-  void operator()(const size_t n)
-  {
-    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
-    thrust::device_vector<T> d_keys = h_keys;
-
-    thrust::host_vector<unsigned int>   h_values(n);
-    thrust::device_vector<unsigned int> d_values(n);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
-
-    ASSERT_ALMOST_EQUAL(h_keys, d_keys);
-    ASSERT_ALMOST_EQUAL(h_values, d_values);
-  }
-};
-VariableUnitTest<TestRadixSortByKey, RadixSortKeyTypes> TestRadixSortByKeyInstance;
-
diff --git a/testing/backend/cuda/radix_sort_by_key_values.cu b/testing/backend/cuda/radix_sort_by_key_values.cu
deleted file mode 100644
index 5b700e2ba..000000000
--- a/testing/backend/cuda/radix_sort_by_key_values.cu
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/functional.h>
-#include <thrust/sequence.h>
-#include <thrust/device_malloc_allocator.h>
-
-#include <thrust/sort.h>
-#include <thrust/system/cuda/detail/detail/stable_radix_sort.h>
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-
-typedef unittest::type_list<
-#if !(defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ <= 1))
-// XXX GCC 4.1 miscompiles the char sorts with -O2 for some reason
-                            unsigned char,
-#endif
-                            unsigned short,
-                            unsigned int,
-                            unsigned long,
-                            unsigned long long> UnsignedIntegerTypes;
-
-template <typename T>
-struct TestRadixSortByKeyShortValues
-{
-  void operator()(const size_t n)
-  {
-    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
-    thrust::device_vector<T> d_keys = h_keys;
-    
-    thrust::host_vector<short>   h_values(n);
-    thrust::device_vector<short> d_values(n);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
-
-    ASSERT_ALMOST_EQUAL(h_keys, d_keys);
-    ASSERT_ALMOST_EQUAL(h_values, d_values);
-  }
-};
-VariableUnitTest<TestRadixSortByKeyShortValues, UnsignedIntegerTypes> TestRadixSortByKeyShortValuesInstance;
-
-template <typename T>
-struct TestRadixSortByKeyLongLongValues
-{
-  void operator()(const size_t n)
-  {
-    thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
-    thrust::device_vector<T> d_keys = h_keys;
-    
-    thrust::host_vector<long long>   h_values(n);
-    thrust::device_vector<long long> d_values(n);
-    thrust::sequence(h_values.begin(), h_values.end());
-    thrust::sequence(d_values.begin(), d_values.end());
-
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin());
-
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_radix_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
-
-    ASSERT_ALMOST_EQUAL(h_keys, d_keys);
-    ASSERT_ALMOST_EQUAL(h_values, d_values);
-  }
-};
-VariableUnitTest<TestRadixSortByKeyLongLongValues, UnsignedIntegerTypes> TestRadixSortByKeyLongLongValuesInstance;
-
-#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-
diff --git a/testing/backend/cuda/reduce_intervals.cu b/testing/backend/cuda/reduce_intervals.cu
deleted file mode 100644
index a1265b329..000000000
--- a/testing/backend/cuda/reduce_intervals.cu
+++ /dev/null
@@ -1,108 +0,0 @@
-#include <unittest/unittest.h>
-
-#include <thrust/functional.h>
-#include <thrust/system/detail/internal/decompose.h>
-#include <thrust/system/cuda/detail/reduce_intervals.h>
-
-// CPP reference implementation 
-template <typename InputIterator,
-          typename OutputIterator,
-          typename BinaryFunction,
-          typename Decomposition>
-void reduce_intervals(InputIterator input,
-                      OutputIterator output,
-                      BinaryFunction binary_op,
-                      Decomposition decomp)
-{
-  typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
-  typedef typename Decomposition::index_type index_type;
-
-  // wrap binary_op
-  thrust::detail::wrapped_function<
-    BinaryFunction,
-    OutputType
-  > wrapped_binary_op(binary_op);
-
-  for(index_type i = 0; i < decomp.size(); ++i, ++output)
-  {
-    InputIterator begin = input + decomp[i].begin();
-    InputIterator end   = input + decomp[i].end();
-
-    if (begin != end)
-    {
-      OutputType sum = *begin;
-
-      ++begin;
-
-      while (begin != end)
-      {
-        sum = wrapped_binary_op(sum, *begin);
-        ++begin;
-      }
-
-      *output = sum;
-    }
-  }
-}
-
-
-void TestCudaReduceIntervalsSimple(void)
-{
-  typedef int T;
-  typedef thrust::device_vector<T> Vector;
-
-  using thrust::system::cuda::detail::reduce_intervals;
-  using thrust::system::detail::internal::uniform_decomposition;
-
-  Vector input(10, 1);
-    
-  {
-    uniform_decomposition<int> decomp(10, 10, 1);
-    Vector output(decomp.size());
-
-    thrust::cuda::tag cuda_tag;
-    reduce_intervals(cuda_tag, input.begin(), output.begin(), thrust::plus<T>(), decomp);
-
-    ASSERT_EQUAL(output[0], 10);
-  }
-  
-  {
-    uniform_decomposition<int> decomp(10, 6, 2);
-    Vector output(decomp.size());
-
-    thrust::cuda::tag cuda_tag;
-    reduce_intervals(cuda_tag, input.begin(), output.begin(), thrust::plus<T>(), decomp);
-
-    ASSERT_EQUAL(output[0], 6);
-    ASSERT_EQUAL(output[1], 4);
-  }
-}
-DECLARE_UNITTEST(TestCudaReduceIntervalsSimple);
-
-
-template <typename T>
-struct TestCudaReduceIntervals
-{
-  void operator()(const size_t n)
-  {
-    using thrust::system::cuda::detail::reduce_intervals;
-    using thrust::system::detail::internal::uniform_decomposition;
-    
-    thrust::host_vector<T>   h_input = unittest::random_integers<T>(n);
-    thrust::device_vector<T> d_input = h_input;
-
-    uniform_decomposition<size_t> decomp(n, 7, 100);
-
-    thrust::host_vector<T>   h_output(decomp.size());
-    thrust::device_vector<T> d_output(decomp.size());
-    
-    ::reduce_intervals(h_input.begin(), h_output.begin(), thrust::plus<T>(), decomp);
-
-    thrust::cuda::tag cuda_tag;
-    reduce_intervals(cuda_tag, d_input.begin(), d_output.begin(), thrust::plus<T>(), decomp);
-
-    ASSERT_EQUAL(h_output, d_output);
-  }
-};
-VariableUnitTest<TestCudaReduceIntervals, IntegralTypes> TestCudaReduceIntervalsInstance;
-
diff --git a/testing/binary_search.cu b/testing/binary_search.cu
index ee27879db..2aceb8645 100644
--- a/testing/binary_search.cu
+++ b/testing/binary_search.cu
@@ -5,7 +5,7 @@
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 //////////////////////
 // Scalar Functions //
@@ -14,8 +14,6 @@ __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 template <class Vector>
 void TestScalarLowerBoundSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -39,7 +37,7 @@ DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundSimple);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-ForwardIterator lower_bound(my_system &system, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+ForwardIterator lower_bound(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     system.validate_dispatch();
     return first;
@@ -61,7 +59,7 @@ DECLARE_UNITTEST(TestScalarLowerBoundDispatchExplicit);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-ForwardIterator lower_bound(my_tag, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+ForwardIterator lower_bound(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     *first = 13;
     return first;
@@ -84,8 +82,6 @@ DECLARE_UNITTEST(TestScalarLowerBoundDispatchImplicit);
 template <class Vector>
 void TestScalarUpperBoundSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -109,7 +105,7 @@ DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundSimple);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-ForwardIterator upper_bound(my_system &system, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+ForwardIterator upper_bound(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     system.validate_dispatch();
     return first;
@@ -131,7 +127,7 @@ DECLARE_UNITTEST(TestScalarUpperBoundDispatchExplicit);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-ForwardIterator upper_bound(my_tag, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+ForwardIterator upper_bound(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     *first = 13;
     return first;
@@ -153,8 +149,6 @@ DECLARE_UNITTEST(TestScalarUpperBoundDispatchImplicit);
 template <class Vector>
 void TestScalarBinarySearchSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -178,7 +172,7 @@ DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchSimple);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-bool binary_search(my_system &system, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+bool binary_search(my_system &system, ForwardIterator /*first*/, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     system.validate_dispatch();
     return false;
@@ -200,7 +194,7 @@ DECLARE_UNITTEST(TestScalarBinarySearchDispatchExplicit);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-bool binary_search(my_tag, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+bool binary_search(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     *first = 13;
     return false;
@@ -222,8 +216,6 @@ DECLARE_UNITTEST(TestScalarBinarySearchDispatchImplicit);
 template <class Vector>
 void TestScalarEqualRangeSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -258,7 +250,7 @@ DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeSimple);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_system &system, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_system &system, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     system.validate_dispatch();
     return thrust::make_pair(first,first);
@@ -280,7 +272,7 @@ DECLARE_UNITTEST(TestScalarEqualRangeDispatchExplicit);
 
 
 template<typename ForwardIterator, typename LessThanComparable>
-thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_tag, ForwardIterator first, ForwardIterator last, const LessThanComparable &value)
+thrust::pair<ForwardIterator,ForwardIterator> equal_range(my_tag, ForwardIterator first, ForwardIterator /*last*/, const LessThanComparable &/*value*/)
 {
     *first = 13;
     return thrust::make_pair(first,first);
@@ -298,4 +290,58 @@ void TestScalarEqualRangeDispatchImplicit()
 }
 DECLARE_UNITTEST(TestScalarEqualRangeDispatchImplicit);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+void TestBoundsWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::lower_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+
+    distance_low_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    distance_high_value = thrust::distance(
+        begin,
+        thrust::upper_bound(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 17);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 17);
+}
+
+void TestBoundsWithBigIndexes()
+{
+    TestBoundsWithBigIndexesHelper(30);
+    TestBoundsWithBigIndexesHelper(31);
+    TestBoundsWithBigIndexesHelper(32);
+    TestBoundsWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestBoundsWithBigIndexes);
diff --git a/testing/binary_search_descending.cu b/testing/binary_search_descending.cu
index 48e44ecbc..08294c044 100644
--- a/testing/binary_search_descending.cu
+++ b/testing/binary_search_descending.cu
@@ -22,16 +22,16 @@ void TestScalarLowerBoundDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 0, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), 1, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 2, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 3, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), 4, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 5, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), 6, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), 7, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 8, thrust::greater<T>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), 9, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::lower_bound(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::lower_bound(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::lower_bound(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::lower_bound(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::lower_bound(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarLowerBoundDescendingSimple);
 
@@ -49,16 +49,16 @@ void TestScalarUpperBoundDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), 0, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 1, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), 2, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 3, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 4, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), 5, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 6, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), 7, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), 8, thrust::greater<int>()));
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), 9, thrust::greater<int>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::upper_bound(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::upper_bound(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::upper_bound(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::upper_bound(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::upper_bound(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::upper_bound(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarUpperBoundDescendingSimple);
 
@@ -76,16 +76,16 @@ void TestScalarBinarySearchDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 0, thrust::greater<int>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 1, thrust::greater<int>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 2, thrust::greater<int>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 3, thrust::greater<int>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 4, thrust::greater<int>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 5, thrust::greater<int>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 6, thrust::greater<int>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 7, thrust::greater<int>()));
-    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), 8, thrust::greater<int>()));
-    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), 9, thrust::greater<int>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{0}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{1}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{2}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{3}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{4}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{5}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{6}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{7}, thrust::greater<T>()));
+    ASSERT_EQUAL(true,  thrust::binary_search(vec.begin(), vec.end(), T{8}, thrust::greater<T>()));
+    ASSERT_EQUAL(false, thrust::binary_search(vec.begin(), vec.end(), T{9}, thrust::greater<T>()));
 }
 DECLARE_VECTOR_UNITTEST(TestScalarBinarySearchDescendingSimple);
 
@@ -103,27 +103,27 @@ void TestScalarEqualRangeDescendingSimple(void)
     vec[3] = 2;
     vec[4] = 0;
 
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<int>()).first);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<int>()).first);
-    
-    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), 0, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 1, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), 2, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 3, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 4, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), 5, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 6, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), 7, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), 8, thrust::greater<int>()).second);
-    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), 9, thrust::greater<int>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater<T>()).first);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater<T>()).first);
+
+    ASSERT_EQUAL_QUIET(vec.begin() + 5, thrust::equal_range(vec.begin(), vec.end(), T{0}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{1}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 4, thrust::equal_range(vec.begin(), vec.end(), T{2}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{3}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{4}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 3, thrust::equal_range(vec.begin(), vec.end(), T{5}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{6}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 2, thrust::equal_range(vec.begin(), vec.end(), T{7}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 1, thrust::equal_range(vec.begin(), vec.end(), T{8}, thrust::greater<T>()).second);
+    ASSERT_EQUAL_QUIET(vec.begin() + 0, thrust::equal_range(vec.begin(), vec.end(), T{9}, thrust::greater<T>()).second);
 }
 DECLARE_VECTOR_UNITTEST(TestScalarEqualRangeDescendingSimple);
 
diff --git a/testing/binary_search_vector.cu b/testing/binary_search_vector.cu
index 859917275..5e8f8358e 100644
--- a/testing/binary_search_vector.cu
+++ b/testing/binary_search_vector.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/binary_search.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -16,15 +17,14 @@ template <class ExampleVector, typename NewType>
 struct vector_like
 {
     typedef typename ExampleVector::allocator_type alloc;
-    typedef typename alloc::template rebind<NewType>::other new_alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
     typedef thrust::detail::vector_base<NewType, new_alloc> type;
 };
 
 template <class Vector>
 void TestVectorLowerBoundSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -36,7 +36,8 @@ void TestVectorLowerBoundSimple(void)
     Vector input(10);
     thrust::sequence(input.begin(), input.end());
 
-    typedef typename vector_like<Vector, int>::type IntVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
 
     // test with integral output type
     IntVector integral_output(10);
@@ -125,8 +126,6 @@ DECLARE_UNITTEST(TestVectorLowerBoundDispatchImplicit);
 template <class Vector>
 void TestVectorUpperBoundSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -138,7 +137,8 @@ void TestVectorUpperBoundSimple(void)
     Vector input(10);
     thrust::sequence(input.begin(), input.end());
 
-    typedef typename vector_like<Vector, int>::type IntVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
 
     // test with integral output type
     IntVector integral_output(10);
@@ -225,8 +225,6 @@ DECLARE_UNITTEST(TestVectorUpperBoundDispatchImplicit);
 template <class Vector>
 void TestVectorBinarySearchSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 0;
@@ -239,7 +237,8 @@ void TestVectorBinarySearchSimple(void)
     thrust::sequence(input.begin(), input.end());
 
     typedef typename vector_like<Vector, bool>::type BoolVector;
-    typedef typename vector_like<Vector,  int>::type IntVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector,  int_type>::type IntVector;
 
     // test with boolean output type
     BoolVector bool_output(10);
@@ -335,8 +334,9 @@ struct TestVectorLowerBound
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
     thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
     thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
@@ -358,8 +358,9 @@ struct TestVectorUpperBound
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
     thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
     thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
@@ -380,8 +381,9 @@ struct TestVectorBinarySearch
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
     thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin());
     thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin());
diff --git a/testing/binary_search_vector_descending.cu b/testing/binary_search_vector_descending.cu
index b97fecf13..edc70663a 100644
--- a/testing/binary_search_vector_descending.cu
+++ b/testing/binary_search_vector_descending.cu
@@ -2,6 +2,7 @@
 #include <thrust/binary_search.h>
 #include <thrust/functional.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
 
@@ -14,7 +15,8 @@ template <class ExampleVector, typename NewType>
 struct vector_like
 {
     typedef typename ExampleVector::allocator_type alloc;
-    typedef typename alloc::template rebind<NewType>::other new_alloc;
+    typedef typename thrust::detail::allocator_traits<alloc> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<NewType> new_alloc;
     typedef thrust::detail::vector_base<NewType, new_alloc> type;
 };
 
@@ -34,7 +36,8 @@ void TestVectorLowerBoundDescendingSimple(void)
     Vector input(10);
     thrust::sequence(input.begin(), input.end());
 
-    typedef typename vector_like<Vector, int>::type IntVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
 
     // test with integral output type
     IntVector integral_output(10);
@@ -59,8 +62,6 @@ DECLARE_VECTOR_UNITTEST(TestVectorLowerBoundDescendingSimple);
 template <class Vector>
 void TestVectorUpperBoundDescendingSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
 
     vec[0] = 8;
@@ -72,11 +73,13 @@ void TestVectorUpperBoundDescendingSimple(void)
     Vector input(10);
     thrust::sequence(input.begin(), input.end());
 
-    typedef typename vector_like<Vector, int>::type IntVector;
+    typedef typename Vector::difference_type int_type;
+    typedef typename Vector::value_type T;
+    typedef typename vector_like<Vector, int_type>::type IntVector;
 
     // test with integral output type
     IntVector integral_output(10);
-    typename IntVector::iterator output_end = thrust::upper_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<int>());
+    typename IntVector::iterator output_end = thrust::upper_bound(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<T>());
 
     ASSERT_EQUAL_QUIET(output_end, integral_output.end());
 
@@ -97,8 +100,6 @@ DECLARE_VECTOR_UNITTEST(TestVectorUpperBoundDescendingSimple);
 template <class Vector>
 void TestVectorBinarySearchDescendingSimple(void)
 {
-  typedef typename Vector::value_type T;
-
   Vector vec(5);
 
   vec[0] = 8;
@@ -111,11 +112,13 @@ void TestVectorBinarySearchDescendingSimple(void)
   thrust::sequence(input.begin(), input.end());
 
   typedef typename vector_like<Vector, bool>::type BoolVector;
-  typedef typename vector_like<Vector,  int>::type IntVector;
+  typedef typename Vector::difference_type int_type;
+  typedef typename Vector::value_type T;
+  typedef typename vector_like<Vector,  int_type>::type IntVector;
 
   // test with boolean output type
   BoolVector bool_output(10);
-  typename BoolVector::iterator bool_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), bool_output.begin(), thrust::greater<int>());
+  typename BoolVector::iterator bool_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), bool_output.begin(), thrust::greater<T>());
 
   ASSERT_EQUAL_QUIET(bool_output_end, bool_output.end());
 
@@ -132,7 +135,7 @@ void TestVectorBinarySearchDescendingSimple(void)
   
   // test with integral output type
   IntVector integral_output(10, 2);
-  typename IntVector::iterator int_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<int>());
+  typename IntVector::iterator int_output_end = thrust::binary_search(vec.begin(), vec.end(), input.begin(), input.end(), integral_output.begin(), thrust::greater<T>());
 
   ASSERT_EQUAL_QUIET(int_output_end, integral_output.end());
   
@@ -161,11 +164,12 @@ struct TestVectorLowerBoundDescending
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
-    thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<int>());
-    thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<int>());
+    thrust::lower_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::lower_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
 
     ASSERT_EQUAL(h_output, d_output);
   }
@@ -178,17 +182,18 @@ struct TestVectorUpperBoundDescending
 {
   void operator()(const size_t n)
   {
-    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<int>());
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<T>());
     thrust::device_vector<T> d_vec = h_vec;
 
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
-    thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<int>());
-    thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<int>());
+    thrust::upper_bound(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::upper_bound(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
 
     ASSERT_EQUAL(h_output, d_output);
   }
@@ -200,17 +205,18 @@ struct TestVectorBinarySearchDescending
 {
   void operator()(const size_t n)
   {
-    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<int>());
+    thrust::host_vector<T>   h_vec = unittest::random_integers<T>(n); thrust::sort(h_vec.begin(), h_vec.end(), thrust::greater<T>());
     thrust::device_vector<T> d_vec = h_vec;
 
     thrust::host_vector<T>   h_input = unittest::random_integers<T>(2*n);
     thrust::device_vector<T> d_input = h_input;
     
-    thrust::host_vector<int>   h_output(2*n);
-    thrust::device_vector<int> d_output(2*n);
+    typedef typename thrust::host_vector<T>::difference_type int_type;
+    thrust::host_vector<int_type>   h_output(2*n);
+    thrust::device_vector<int_type> d_output(2*n);
 
-    thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<int>());
-    thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<int>());
+    thrust::binary_search(h_vec.begin(), h_vec.end(), h_input.begin(), h_input.end(), h_output.begin(), thrust::greater<T>());
+    thrust::binary_search(d_vec.begin(), d_vec.end(), d_input.begin(), d_input.end(), d_output.begin(), thrust::greater<T>());
 
     ASSERT_EQUAL(h_output, d_output);
   }
diff --git a/testing/caching_allocator.cu b/testing/caching_allocator.cu
new file mode 100644
index 000000000..f98ea336b
--- /dev/null
+++ b/testing/caching_allocator.cu
@@ -0,0 +1,23 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/caching_allocator.h>
+
+template<typename Allocator>
+void test_implementation(Allocator alloc)
+{
+    typedef typename thrust::detail::allocator_traits<Allocator> Traits;
+    typedef typename Allocator::pointer Ptr;
+
+    Ptr p = Traits::allocate(alloc, 123);
+    Traits::deallocate(alloc, p, 123);
+
+    Ptr p2 = Traits::allocate(alloc, 123);
+    ASSERT_EQUAL(p, p2);
+}
+
+void TestSingleDeviceTLSCachingAllocator()
+{
+    test_implementation(thrust::detail::single_device_tls_caching_allocator());
+};
+DECLARE_UNITTEST(TestSingleDeviceTLSCachingAllocator);
diff --git a/testing/cmake/CMakeLists.txt b/testing/cmake/CMakeLists.txt
new file mode 100644
index 000000000..71798de75
--- /dev/null
+++ b/testing/cmake/CMakeLists.txt
@@ -0,0 +1,37 @@
+thrust_update_system_found_flags()
+
+set(extra_cmake_flags)
+
+# Need to pass these when testing NVC++.
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set(extra_cmake_flags
+    -D "CMAKE_CUDA_COMPILER_ID=${CMAKE_CUDA_COMPILER_ID}"
+    -D "CMAKE_CUDA_COMPILER_FORCED=${CMAKE_CUDA_COMPILER_FORCED}"
+  )
+endif()
+
+if (THRUST_CPP_FOUND AND THRUST_CUDA_FOUND)
+  # Test that we can use `find_package` on an installed Thrust:
+  add_test(
+    NAME thrust.test.cmake.test_install
+    COMMAND "${CMAKE_COMMAND}"
+      --log-level=VERBOSE
+      -G "${CMAKE_GENERATOR}"
+      -S "${CMAKE_CURRENT_SOURCE_DIR}/test_install"
+      -B "${CMAKE_CURRENT_BINARY_DIR}/test_install"
+      -D "THRUST_BINARY_DIR=${Thrust_BINARY_DIR}"
+      -D "CMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
+      -D "CMAKE_CUDA_COMPILER=${CMAKE_CUDA_COMPILER}"
+      -D "CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}"
+      ${extra_cmake_flags}
+  )
+endif()
+
+# Check source code for issues that can be found by pattern matching:
+add_test(
+  NAME thrust.test.cmake.check_source_files
+  COMMAND
+    "${CMAKE_COMMAND}"
+      -D "Thrust_SOURCE_DIR=${Thrust_SOURCE_DIR}"
+      -P "${CMAKE_CURRENT_LIST_DIR}/check_source_files.cmake"
+)
diff --git a/testing/cmake/check_source_files.cmake b/testing/cmake/check_source_files.cmake
new file mode 100644
index 000000000..900300c67
--- /dev/null
+++ b/testing/cmake/check_source_files.cmake
@@ -0,0 +1,185 @@
+# Check all source files for various issues that can be detected using pattern
+# matching.
+#
+# This is run as a ctest test named `thrust.test.cmake.check_source_files`, or
+# manually with:
+# cmake -D "Thrust_SOURCE_DIR=<thrust project root>" -P check_source_files.cmake
+
+cmake_minimum_required(VERSION 3.15)
+
+function(count_substrings input search_regex output_var)
+  string(REGEX MATCHALL "${search_regex}" matches "${input}")
+  list(LENGTH matches num_matches)
+  set(${output_var} ${num_matches} PARENT_SCOPE)
+endfunction()
+
+set(found_errors 0)
+file(GLOB_RECURSE thrust_srcs
+  RELATIVE "${Thrust_SOURCE_DIR}"
+  "${Thrust_SOURCE_DIR}/thrust/*.h"
+  "${Thrust_SOURCE_DIR}/thrust/*.inl"
+)
+
+################################################################################
+# Namespace checks.
+# Check all files in thrust to make sure that they use
+# THRUST_NAMESPACE_BEGIN/END instead of bare `namespace thrust {}` declarations.
+set(namespace_exclusions
+  # This defines the macros and must have bare namespace declarations:
+  thrust/detail/config/namespace.h
+)
+
+set(bare_ns_regex "namespace[ \n\r\t]+thrust[ \n\r\t]*\\{")
+
+# Validation check for the above regex:
+count_substrings([=[
+namespace thrust{
+namespace thrust {
+namespace  thrust  {
+ namespace thrust {
+namespace thrust
+{
+namespace
+thrust
+{
+]=]
+  ${bare_ns_regex} valid_count)
+if (NOT valid_count EQUAL 6)
+  message(FATAL_ERROR "Validation of bare namespace regex failed: "
+                      "Matched ${valid_count} times, expected 6.")
+endif()
+
+################################################################################
+# stdpar header checks.
+# Check all files in Thrust to make sure that they aren't including <algorithm>
+# or <memory>, both of which will cause circular dependencies in nvc++'s
+# stdpar library.
+#
+# The headers following headers should be used instead:
+# <algorithm> -> <thrust/detail/algorithm_wrapper.h>
+# <memory>    -> <thrust/detail/memory_wrapper.h>
+#
+set(stdpar_header_exclusions
+  # The wrappers are allowed to include the unwrapped headers
+  thrust/detail/algorithm_wrapper.h
+  thrust/detail/memory_wrapper.h
+  thrust/detail/numeric_wrapper.h
+)
+
+set(algorithm_regex "#[ \t]*include[ \t]+<algorithm>")
+set(memory_regex    "#[ \t]*include[ \t]+<memory>")
+set(numeric_regex   "#[ \t]*include[ \t]+<numeric>")
+
+# Validation check for the above regex pattern:
+count_substrings([=[
+#include <algorithm>
+# include <algorithm>
+#include  <algorithm>
+# include  <algorithm>
+# include  <algorithm> // ...
+]=]
+  ${algorithm_regex} valid_count)
+if (NOT valid_count EQUAL 5)
+  message(FATAL_ERROR "Validation of stdpar header regex failed: "
+    "Matched ${valid_count} times, expected 5.")
+endif()
+
+################################################################################
+# Legacy macro checks.
+# Check all files in Thrust to make sure that they aren't using the legacy
+# CUB_RUNTIME_ENABLED and __THRUST_HAS_CUDART__ macros.
+#
+# These macros depend on __CUDA_ARCH__ and are not compatible with NV_IF_TARGET.
+# They are provided for legacy purposes and should be replaced with
+# [THRUST|CUB]_RDC_ENABLED and NV_IF_TARGET in Thrust/CUB code.
+#
+#
+set(legacy_macro_header_exclusions
+  # This header defines a legacy CUDART macro:
+  thrust/system/cuda/config.h
+)
+
+set(cub_legacy_macro_regex "CUB_RUNTIME_ENABLED")
+set(thrust_legacy_macro_regex "__THRUST_HAS_CUDART__")
+
+################################################################################
+# Read source files:
+foreach(src ${thrust_srcs})
+  file(READ "${Thrust_SOURCE_DIR}/${src}" src_contents)
+
+  if (NOT ${src} IN_LIST namespace_exclusions)
+    count_substrings("${src_contents}" "${bare_ns_regex}" bare_ns_count)
+    count_substrings("${src_contents}" THRUST_NS_PREFIX prefix_count)
+    count_substrings("${src_contents}" THRUST_NS_POSTFIX postfix_count)
+    count_substrings("${src_contents}" THRUST_NAMESPACE_BEGIN begin_count)
+    count_substrings("${src_contents}" THRUST_NAMESPACE_END end_count)
+    count_substrings("${src_contents}" "#include <thrust/detail/config.h>" header_count)
+
+    if (NOT bare_ns_count EQUAL 0)
+      message("'${src}' contains 'namespace thrust {...}'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT prefix_count EQUAL 0)
+      message("'${src}' contains 'THRUST_NS_PREFIX'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT postfix_count EQUAL 0)
+      message("'${src}' contains 'THRUST_NS_POSTFIX'. Replace with THRUST_NAMESPACE macros.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT begin_count EQUAL end_count)
+      message("'${src}' namespace macros are unbalanced:")
+      message(" - THRUST_NAMESPACE_BEGIN occurs ${begin_count} times.")
+      message(" - THRUST_NAMESPACE_END   occurs ${end_count} times.")
+      set(found_errors 1)
+    endif()
+
+    if (begin_count GREATER 0 AND header_count EQUAL 0)
+      message("'${src}' uses Thrust namespace macros, but does not (directly) `#include <thrust/detail/config.h>`.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST stdpar_header_exclusions)
+    count_substrings("${src_contents}" "${algorithm_regex}" algorithm_count)
+    count_substrings("${src_contents}" "${memory_regex}" memory_count)
+    count_substrings("${src_contents}" "${numeric_regex}" numeric_count)
+
+    if (NOT algorithm_count EQUAL 0)
+      message("'${src}' includes the <algorithm> header. Replace with <thrust/detail/algorithm_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT memory_count EQUAL 0)
+      message("'${src}' includes the <memory> header. Replace with <thrust/detail/memory_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT numeric_count EQUAL 0)
+      message("'${src}' includes the <numeric> header. Replace with <thrust/detail/numeric_wrapper.h>.")
+      set(found_errors 1)
+    endif()
+  endif()
+
+  if (NOT ${src} IN_LIST legacy_macro_header_exclusions)
+    count_substrings("${src_contents}" "${thrust_legacy_macro_regex}" thrust_count)
+    count_substrings("${src_contents}" "${cub_legacy_macro_regex}" cub_count)
+
+    if (NOT thrust_count EQUAL 0)
+      message("'${src}' uses __THRUST_HAS_CUDART__. Replace with THRUST_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+
+    if (NOT cub_count EQUAL 0)
+      message("'${src}' uses CUB_RUNTIME_ENABLED. Replace with CUB_RDC_ENABLED and NV_IF_TARGET.")
+      set(found_errors 1)
+    endif()
+  endif()
+endforeach()
+
+if (NOT found_errors EQUAL 0)
+  message(FATAL_ERROR "Errors detected.")
+endif()
diff --git a/testing/cmake/test_install/CMakeLists.txt b/testing/cmake/test_install/CMakeLists.txt
new file mode 100644
index 000000000..30cf8405c
--- /dev/null
+++ b/testing/cmake/test_install/CMakeLists.txt
@@ -0,0 +1,110 @@
+# Test that an installation of the project can be located by find_package() call
+# with appropriate prefix settings.
+#
+# Expects THRUST_BINARY_DIR to be set to an existing thrust build directory.
+
+cmake_minimum_required(VERSION 3.15)
+
+project(ThrustTestInstall CXX CUDA)
+
+# This will eventually get deleted recursively -- keep that in mind if modifying
+set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/install_prefix/")
+
+function(do_manual_install)
+  # Inspired by the VTK-m install tests, we can just glob up all of the
+  # cmake_install.cmake, include (ie. run) them, and they'll effectively
+  # install the project into the current value of CMAKE_INSTALL_PREFIX.
+
+  # Gather all of the install files from Thrust's root:
+  file(GLOB_RECURSE install_files
+    LIST_DIRECTORIES False
+    "${THRUST_BINARY_DIR}/cmake_install.cmake"
+  )
+
+  message(STATUS "Locating install files...")
+  foreach (install_file IN LISTS install_files)
+    message(STATUS "  * ${install_file}")
+  endforeach()
+
+  message(STATUS "Building install tree...")
+  foreach(install_file IN LISTS install_files)
+    include("${install_file}")
+  endforeach()
+endfunction()
+
+function(do_cleanup)
+  message(STATUS "Removing ${CMAKE_INSTALL_PREFIX}")
+  file(REMOVE_RECURSE "${CMAKE_INSTALL_PREFIX}")
+endfunction()
+
+function(assert_boolean var_name expect)
+  if (expect)
+    if (NOT ${var_name})
+      message(FATAL_ERROR "'${var_name}' is false, expected true.")
+    endif()
+  else()
+    if (${var_name})
+      message(FATAL_ERROR "'${var_name}' is true, expected false.")
+    endif()
+  endif()
+endfunction()
+
+function(assert_target target_name)
+  if (NOT TARGET "${target_name}")
+    message(FATAL_ERROR "Target '${target_name}' not defined.")
+  endif()
+endfunction()
+
+function(find_installed_project)
+  set(CMAKE_PREFIX_PATH "${CMAKE_INSTALL_PREFIX}")
+  find_package(Thrust CONFIG COMPONENTS CPP CUDA)
+
+  if (NOT Thrust_FOUND)
+    message(FATAL_ERROR
+      "find_package(Thrust) failed. "
+      "CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}"
+    )
+  endif()
+
+  # Test some internal config vars to check that this is the expected install:
+  # TODO The cmake_path (3.19) command will provide more robust ways to do this
+
+  # Escape regex special characters in the install prefix, see
+  # https://gitlab.kitware.com/cmake/cmake/-/issues/18580
+  string(REGEX REPLACE "([][+.*()^])" "\\\\\\1"
+    prefix_regex
+    "${CMAKE_INSTALL_PREFIX}"
+  )
+  if (NOT _THRUST_INCLUDE_DIR MATCHES "^${prefix_regex}")
+    message(FATAL_ERROR
+      "Found Thrust in unexpected location: "
+      " * _THRUST_INCLUDE_DIR=${_THRUST_INCLUDE_DIR} "
+      " * ExpectedPrefix=${CMAKE_INSTALL_DIR}"
+    )
+  endif()
+  if (NOT _CUB_INCLUDE_DIR MATCHES "^${prefix_regex}")
+    message(FATAL_ERROR
+      "Found CUB in unexpected location: "
+      " * _CUB_INCLUDE_DIR=${_CUB_INCLUDE_DIR} "
+      " * ExpectedPrefix=${CMAKE_INSTALL_DIR}"
+    )
+  endif()
+
+  thrust_create_target(Thrust)
+  assert_target(Thrust)
+  assert_target(CUB::CUB)
+  assert_target(Thrust::CPP::Host)
+  assert_target(Thrust::CUDA::Device)
+
+  thrust_update_system_found_flags()
+  assert_boolean(THRUST_CPP_FOUND TRUE)
+  assert_boolean(THRUST_CUDA_FOUND TRUE)
+  assert_boolean(THRUST_OMP_FOUND FALSE)
+  assert_boolean(THRUST_TBB_FOUND FALSE)
+
+endfunction()
+
+do_cleanup() # Prepare for new installation
+do_manual_install()
+find_installed_project()
+do_cleanup() # Clean up if successful
diff --git a/testing/complex.cu b/testing/complex.cu
index eb114215d..cf980962a 100644
--- a/testing/complex.cu
+++ b/testing/complex.cu
@@ -1,6 +1,8 @@
 #include <unittest/unittest.h>
 
 #include <thrust/complex.h>
+#include <thrust/detail/config.h>
+
 #include <complex>
 #include <iostream>
 #include <sstream>
@@ -11,6 +13,28 @@
    and takes a lot of time to run.   
  */
 
+template<typename T>
+struct TestComplexSizeAndAlignment
+{
+  void operator()()
+  {
+    THRUST_STATIC_ASSERT(
+      sizeof(thrust::complex<T>) == sizeof(T) * 2
+    );
+    THRUST_STATIC_ASSERT(
+      THRUST_ALIGNOF(thrust::complex<T>) == THRUST_ALIGNOF(T) * 2
+    );
+
+    THRUST_STATIC_ASSERT(
+      sizeof(thrust::complex<T const>) == sizeof(T) * 2
+    );
+    THRUST_STATIC_ASSERT(
+      THRUST_ALIGNOF(thrust::complex<T const>) == THRUST_ALIGNOF(T) * 2
+    );
+  }
+};
+SimpleUnitTest<TestComplexSizeAndAlignment, FloatingPointTypes> TestComplexSizeAndAlignmentInstance;
+
 template<typename T>
 struct TestComplexConstructors
 {
@@ -30,16 +54,16 @@ struct TestComplexConstructors
     a = thrust::complex<T>();
     ASSERT_ALMOST_EQUAL(a,std::complex<T>(0));
     
-    a = thrust::complex<T>(thrust::complex<float>(data[0],data[1]));
+    a = thrust::complex<T>(thrust::complex<float>(static_cast<float>(data[0]),static_cast<float>(data[1])));
     ASSERT_ALMOST_EQUAL(a,b);
     
-    a = thrust::complex<T>(thrust::complex<double>(data[0],data[1]));
+    a = thrust::complex<T>(thrust::complex<double>(static_cast<double>(data[0]),static_cast<double>(data[1])));
     ASSERT_ALMOST_EQUAL(a,b);
     
-    a = thrust::complex<T>(std::complex<float>(data[0],data[1]));
+    a = thrust::complex<T>(std::complex<float>(static_cast<float>(data[0]),static_cast<float>(data[1])));
     ASSERT_ALMOST_EQUAL(a,b);
     
-    a = thrust::complex<T>(std::complex<double>(data[0],data[1]));
+    a = thrust::complex<T>(std::complex<double>(static_cast<double>(data[0]),static_cast<double>(data[1])));
     ASSERT_ALMOST_EQUAL(a,b);
   }
 };
@@ -251,7 +275,7 @@ struct TestComplexTrigonometricFunctions
     ASSERT_ALMOST_EQUAL(sinh(a),sinh(c));
     ASSERT_ALMOST_EQUAL(tanh(a),tanh(c));
 
-#if __cplusplus >= 201103L
+#if THRUST_CPP_DIALECT >= 2011
 
     ASSERT_ALMOST_EQUAL(acos(a),acos(c));
     ASSERT_ALMOST_EQUAL(asin(a),asin(c));
@@ -282,5 +306,29 @@ struct TestComplexStreamOperators
     ASSERT_ALMOST_EQUAL(a,b);
   }
 };
-
 SimpleUnitTest<TestComplexStreamOperators, FloatingPointTypes> TestComplexStreamOperatorsInstance;
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T>
+struct TestComplexStdComplexDeviceInterop
+{
+  void operator()()
+  {
+    thrust::host_vector<T> data = unittest::random_samples<T>(6);
+    std::vector<std::complex<T> > vec(10);
+    vec[0] = std::complex<T>(data[0], data[1]);
+    vec[1] = std::complex<T>(data[2], data[3]);
+    vec[2] = std::complex<T>(data[4], data[5]);
+
+    thrust::device_vector<thrust::complex<T> > device_vec = vec;
+    ASSERT_ALMOST_EQUAL(vec[0].real(), thrust::complex<T>(device_vec[0]).real());
+    ASSERT_ALMOST_EQUAL(vec[0].imag(), thrust::complex<T>(device_vec[0]).imag());
+    ASSERT_ALMOST_EQUAL(vec[1].real(), thrust::complex<T>(device_vec[1]).real());
+    ASSERT_ALMOST_EQUAL(vec[1].imag(), thrust::complex<T>(device_vec[1]).imag());
+    ASSERT_ALMOST_EQUAL(vec[2].real(), thrust::complex<T>(device_vec[2]).real());
+    ASSERT_ALMOST_EQUAL(vec[2].imag(), thrust::complex<T>(device_vec[2]).imag());
+  }
+};
+SimpleUnitTest<TestComplexStdComplexDeviceInterop, FloatingPointTypes> TestComplexStdComplexDeviceInteropInstance;
+#endif
+
diff --git a/testing/complex_transform.cu b/testing/complex_transform.cu
index c70c4cd6a..439597a0d 100644
--- a/testing/complex_transform.cu
+++ b/testing/complex_transform.cu
@@ -5,7 +5,7 @@
 #include <iostream>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include <backend/cuda/testframework.h>
+#include <unittest/cuda/testframework.h>
 #endif
 
 struct basic_arithmetic_functor
@@ -235,15 +235,6 @@ struct TestComplexArithmeticTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_p2 = h_p2;
     thrust::device_vector<type> d_result(n);
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
 
     thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), basic_arithmetic_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), basic_arithmetic_functor());    
@@ -264,16 +255,6 @@ struct TestComplexPlaneTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), complex_plane_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), complex_plane_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
@@ -296,16 +277,6 @@ struct TestComplexPowerTransform
     thrust::device_vector<type> d_p2 = h_p2;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_p2.begin(), h_result.begin(), pow_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_p2.begin(), d_result.begin(), pow_functor());    
     // pow can be very innacurate there's no point trying to check for equality
@@ -331,16 +302,6 @@ struct TestComplexExponentialTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), exp_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), exp_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
@@ -368,15 +329,6 @@ struct TestComplexTrigonometricTransform
     thrust::device_vector<type> d_p1 = h_p1;
     thrust::device_vector<type> d_result(n);
 
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(double) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
 
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), sin_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), sin_functor());    
@@ -404,7 +356,6 @@ struct TestComplexTrigonometricTransform
     ASSERT_ALMOST_EQUAL(h_result, d_result);
 
 
-
     thrust::transform(h_p1.begin(), h_p1.end(), h_result.begin(), asin_functor());
     thrust::transform(d_p1.begin(), d_p1.end(), d_result.begin(), asin_functor());    
     ASSERT_ALMOST_EQUAL(h_result, d_result);
diff --git a/testing/constant_iterator.cu b/testing/constant_iterator.cu
index e909d71e9..e42cfea8d 100644
--- a/testing/constant_iterator.cu
+++ b/testing/constant_iterator.cu
@@ -46,6 +46,17 @@ void TestConstantIteratorIncrement(void)
 }
 DECLARE_UNITTEST(TestConstantIteratorIncrement);
 
+void TestConstantIteratorIncrementBig(void)
+{
+    long long int n = 10000000000ULL;
+
+    thrust::constant_iterator<long long int> begin(1);
+    thrust::constant_iterator<long long int> end = begin + n;
+
+    ASSERT_EQUAL(thrust::distance(begin, end), n);
+}
+DECLARE_UNITTEST(TestConstantIteratorIncrementBig);
+
 void TestConstantIteratorComparison(void)
 {
     using namespace thrust;
@@ -85,7 +96,7 @@ void TestMakeConstantIterator(void)
     ASSERT_EQUAL(13, *iter0);
 
     // test two argument version
-    constant_iterator<int,int> iter1 = make_constant_iterator<int,int>(13, 7);
+    constant_iterator<int,thrust::detail::intmax_t> iter1 = make_constant_iterator<int,thrust::detail::intmax_t>(13, 7);
 
     ASSERT_EQUAL(13, *iter1);
     ASSERT_EQUAL(7, iter1 - iter0);
@@ -98,12 +109,12 @@ void TestConstantIteratorCopy(void)
 {
   using namespace thrust;
 
-  typedef typename Vector::value_type T;
-  typedef constant_iterator<int> ConstIter;
+  using ValueType = typename Vector::value_type;
+  using ConstIter = constant_iterator<ValueType>;
 
   Vector result(4);
 
-  ConstIter first = make_constant_iterator<int>(7);
+  ConstIter first = make_constant_iterator<ValueType>(7);
   ConstIter last  = first + result.size();
   thrust::copy(first, last, result.begin());
 
diff --git a/testing/copy.cu b/testing/copy.cu
index 3759524d4..661e379a2 100644
--- a/testing/copy.cu
+++ b/testing/copy.cu
@@ -1,14 +1,19 @@
 #include <unittest/unittest.h>
 #include <thrust/copy.h>
 
+#include <array>
+#include <algorithm>
 #include <list>
 #include <iterator>
+#include <thrust/detail/config.h>
 #include <thrust/sequence.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 void TestCopyFromConstIterator(void)
 {
@@ -133,8 +138,6 @@ DECLARE_VECTOR_UNITTEST(TestCopyMatchingTypes);
 template <class Vector>
 void TestCopyMixedTypes(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(5);
     v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
 
@@ -159,7 +162,7 @@ void TestCopyMixedTypes(void)
     ASSERT_EQUAL(d[4], 4);
     ASSERT_EQUAL_QUIET(d_result, d.end());
 }
-DECLARE_VECTOR_UNITTEST(TestCopyMixedTypes);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyMixedTypes);
 
 
 void TestCopyVectorBool(void)
@@ -169,7 +172,7 @@ void TestCopyVectorBool(void)
 
     thrust::host_vector<bool> h(3);
     thrust::device_vector<bool> d(3);
-    
+
     thrust::copy(v.begin(), v.end(), h.begin());
     thrust::copy(v.begin(), v.end(), d.begin());
 
@@ -196,30 +199,30 @@ void TestCopyListTo(void)
     l.push_back(2);
     l.push_back(3);
     l.push_back(4);
-   
+
     Vector v(l.size());
 
     typename Vector::iterator v_result = thrust::copy(l.begin(), l.end(), v.begin());
 
-    ASSERT_EQUAL(v[0], 0);
-    ASSERT_EQUAL(v[1], 1);
-    ASSERT_EQUAL(v[2], 2);
-    ASSERT_EQUAL(v[3], 3);
-    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL(v[0], T(0));
+    ASSERT_EQUAL(v[1], T(1));
+    ASSERT_EQUAL(v[2], T(2));
+    ASSERT_EQUAL(v[3], T(3));
+    ASSERT_EQUAL(v[4], T(4));
     ASSERT_EQUAL_QUIET(v_result, v.end());
 
     l.clear();
 
     thrust::copy(v.begin(), v.end(), std::back_insert_iterator< std::list<T> >(l));
 
-    ASSERT_EQUAL(l.size(), 5);
+    ASSERT_EQUAL(l.size(), 5lu);
 
     typename std::list<T>::const_iterator iter = l.begin();
-    ASSERT_EQUAL(*iter, 0);  iter++;
-    ASSERT_EQUAL(*iter, 1);  iter++;
-    ASSERT_EQUAL(*iter, 2);  iter++;
-    ASSERT_EQUAL(*iter, 3);  iter++;
-    ASSERT_EQUAL(*iter, 4);  iter++;
+    ASSERT_EQUAL(*iter, T(0));  iter++;
+    ASSERT_EQUAL(*iter, T(1));  iter++;
+    ASSERT_EQUAL(*iter, T(2));  iter++;
+    ASSERT_EQUAL(*iter, T(3));  iter++;
+    ASSERT_EQUAL(*iter, T(4));  iter++;
 }
 DECLARE_VECTOR_UNITTEST(TestCopyListTo);
 
@@ -228,7 +231,7 @@ template<typename T>
 struct is_even
 {
     __host__ __device__
-    bool operator()(T x) { return (static_cast<unsigned int>(x) & 1) == 0; }
+    bool operator()(T x) { return (x & 1) == 0; }
 };
 
 template<typename T>
@@ -242,10 +245,9 @@ template<typename T>
 struct mod_3
 {
     __host__ __device__
-    unsigned int operator()(T x) { return static_cast<unsigned int>(x) % 3; }
+    unsigned int operator()(T x) { return x % 3; }
 };
-    
-    
+
 
 template <class Vector>
 void TestCopyIfSimple(void)
@@ -255,13 +257,14 @@ void TestCopyIfSimple(void)
     Vector v(5);
     v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
 
-    Vector dest(3);
+    Vector dest(4);
 
-    typename Vector::iterator dest_end = thrust::copy_if(v.begin(), v.end(), dest.begin(), is_even<T>());
+    typename Vector::iterator dest_end = thrust::copy_if(v.begin(), v.end(), dest.begin(), is_true<T>());
 
-    ASSERT_EQUAL(0, dest[0]);
+    ASSERT_EQUAL(1, dest[0]);
     ASSERT_EQUAL(2, dest[1]);
-    ASSERT_EQUAL(4, dest[2]);
+    ASSERT_EQUAL(3, dest[2]);
+    ASSERT_EQUAL(4, dest[3]);
     ASSERT_EQUAL_QUIET(dest.end(), dest_end);
 }
 DECLARE_VECTOR_UNITTEST(TestCopyIfSimple);
@@ -276,6 +279,71 @@ void TestCopyIf(const size_t n)
     typename thrust::host_vector<T>::iterator   h_new_end;
     typename thrust::device_vector<T>::iterator d_new_end;
 
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIf);
+
+
+template <typename T>
+void TestCopyIfIntegral(const size_t n)
+{
+    thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
+    thrust::device_vector<T> d_data = h_data;
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
+    // test with Predicate that returns a bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_even<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+
+    // test with Predicate that returns a non-bool
+    {
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), mod_3<T>());
+
+        h_result.resize(h_new_end - h_result.begin());
+        d_result.resize(d_new_end - d_result.begin());
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfIntegral);
+
+
+template <typename T>
+void TestCopyIfSequence(const size_t n)
+{
+    thrust::host_vector<T>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
+    thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end());
+
+    typename thrust::host_vector<T>::iterator   h_new_end;
+    typename thrust::device_vector<T>::iterator d_new_end;
+
     // test with Predicate that returns a bool
     {
         thrust::host_vector<T>   h_result(n);
@@ -289,7 +357,7 @@ void TestCopyIf(const size_t n)
 
         ASSERT_EQUAL(h_result, d_result);
     }
-    
+
     // test with Predicate that returns a non-bool
     {
         thrust::host_vector<T>   h_result(n);
@@ -304,7 +372,7 @@ void TestCopyIf(const size_t n)
         ASSERT_EQUAL(h_result, d_result);
     }
 }
-DECLARE_VARIABLE_UNITTEST(TestCopyIf);
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfSequence);
 
 
 template <class Vector>
@@ -334,46 +402,124 @@ template <typename T>
 void TestCopyIfStencil(const size_t n)
 {
     thrust::host_vector<T>   h_data(n); thrust::sequence(h_data.begin(), h_data.end());
-    thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end()); 
+    thrust::device_vector<T> d_data(n); thrust::sequence(d_data.begin(), d_data.end());
 
     thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
     thrust::device_vector<T> d_stencil = unittest::random_integers<T>(n);
 
-    thrust::host_vector<T>   h_result(n);
-    thrust::device_vector<T> d_result(n);
-
     typename thrust::host_vector<T>::iterator   h_new_end;
     typename thrust::device_vector<T>::iterator d_new_end;
 
-    // test with Predicate that returns a bool
     {
         thrust::host_vector<T>   h_result(n);
         thrust::device_vector<T> d_result(n);
 
-        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<T>());
-        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_even<T>());
+        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_even<T>());
+        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_even<T>());
 
         h_result.resize(h_new_end - h_result.begin());
         d_result.resize(d_new_end - d_result.begin());
 
         ASSERT_EQUAL(h_result, d_result);
     }
-    
-    // test with Predicate that returns a non-bool
-    {
-        thrust::host_vector<T>   h_result(n);
-        thrust::device_vector<T> d_result(n);
 
-        h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<T>());
-        d_new_end = thrust::copy_if(d_data.begin(), d_data.end(), d_result.begin(), mod_3<T>());
+}
+DECLARE_INTEGRAL_VARIABLE_UNITTEST(TestCopyIfStencil);
 
-        h_result.resize(h_new_end - h_result.begin());
-        d_result.resize(d_new_end - d_result.begin());
+namespace
+{
 
-        ASSERT_EQUAL(h_result, d_result);
+struct object_with_non_trivial_ctor
+{
+  // This struct will only properly assign if its `magic` member is
+  // set to this certain number.
+  static constexpr int MAGIC = 923390;
+
+  int field;
+  int magic;
+
+  __host__ __device__ object_with_non_trivial_ctor()
+  {
+    magic = MAGIC;
+    field = 0;
+  }
+  __host__ __device__ object_with_non_trivial_ctor(int f)
+  {
+    magic = MAGIC;
+    field = f;
+  }
+
+  object_with_non_trivial_ctor(const object_with_non_trivial_ctor& x) = default;
+
+  // This non-trivial assignment requires that `this` points to initialized
+  // memory
+  __host__ __device__ object_with_non_trivial_ctor&
+  operator=(const object_with_non_trivial_ctor& x)
+  {
+    // To really copy over x's field value, require we have magic value set.
+    // If copy_if copies to uninitialized bits, the field will rarely be 923390.
+    if (magic == MAGIC)
+    {
+      field = x.field;
     }
+    return *this;
+  }
+};
+
+struct always_true
+{
+  __host__ __device__
+  bool operator()(const object_with_non_trivial_ctor&)
+  {
+    return true;
+  };
+};
+
+} // end anon namespace
+
+void TestCopyIfNonTrivial()
+{
+  // Attempting to copy an object_with_non_trivial_ctor into uninitialized
+  // memory will fail:
+  {
+    static constexpr size_t BufferAlign = alignof(object_with_non_trivial_ctor);
+    static constexpr size_t BufferSize = sizeof(object_with_non_trivial_ctor);
+    alignas(BufferAlign) std::array<unsigned char, BufferSize> buffer;
+
+    // Fill buffer with 0s to prevent warnings about uninitialized reads while
+    // ensure that the 'magic number' mechanism works as intended:
+    std::fill(buffer.begin(), buffer.end(), 0);
+
+    object_with_non_trivial_ctor initialized;
+    object_with_non_trivial_ctor *uninitialized =
+      reinterpret_cast<object_with_non_trivial_ctor*>(buffer.data());
+
+    object_with_non_trivial_ctor source(42);
+    initialized = source;
+    *uninitialized = source;
+
+    ASSERT_EQUAL(42, initialized.field);
+    ASSERT_NOT_EQUAL(42, uninitialized->field);
+  }
+
+  // This test ensures that we use placement new instead of assigning
+  // to uninitialized memory. See Thrust Github issue #1153.
+  thrust::device_vector<object_with_non_trivial_ctor> a(10, object_with_non_trivial_ctor(99));
+  thrust::device_vector<object_with_non_trivial_ctor> b(10);
+
+  thrust::copy_if(a.begin(), a.end(), b.begin(), always_true());
+
+  for (int i = 0; i < 10; i++)
+  {
+    object_with_non_trivial_ctor ha(a[i]);
+    object_with_non_trivial_ctor hb(b[i]);
+    int ia = ha.field;
+    int ib = hb.field;
+
+    ASSERT_EQUAL(ia, ib);
+  }
 }
-DECLARE_VARIABLE_UNITTEST(TestCopyIfStencil);
+DECLARE_UNITTEST(TestCopyIfNonTrivial);
 
 template <typename Vector>
 void TestCopyCountingIterator(void)
@@ -391,7 +537,7 @@ void TestCopyCountingIterator(void)
     ASSERT_EQUAL(vec[2], 3);
     ASSERT_EQUAL(vec[3], 4);
 }
-DECLARE_VECTOR_UNITTEST(TestCopyCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyCountingIterator);
 
 template <typename Vector>
 void TestCopyZipIterator(void)
@@ -399,7 +545,7 @@ void TestCopyZipIterator(void)
     typedef typename Vector::value_type T;
 
     Vector v1(3); v1[0] = 1; v1[1] = 2; v1[2] = 3;
-    Vector v2(3); v2[0] = 4; v2[1] = 5; v2[2] = 6; 
+    Vector v2(3); v2[0] = 4; v2[1] = 5; v2[2] = 6;
     Vector v3(3, T(0));
     Vector v4(3, T(0));
 
@@ -476,7 +622,7 @@ DECLARE_UNITTEST(TestCopyDispatchImplicit);
 
 
 template<typename InputIterator, typename OutputIterator, typename Predicate>
-OutputIterator copy_if(my_system &system, InputIterator, InputIterator, OutputIterator result, Predicate pred)
+OutputIterator copy_if(my_system &system, InputIterator, InputIterator, OutputIterator result, Predicate)
 {
     system.validate_dispatch();
     return result;
@@ -499,7 +645,7 @@ DECLARE_UNITTEST(TestCopyIfDispatchExplicit);
 
 
 template<typename InputIterator, typename OutputIterator, typename Predicate>
-OutputIterator copy_if(my_tag, InputIterator, InputIterator, OutputIterator result, Predicate pred)
+OutputIterator copy_if(my_tag, InputIterator, InputIterator, OutputIterator result, Predicate)
 {
     *result = 13;
     return result;
@@ -520,7 +666,7 @@ DECLARE_UNITTEST(TestCopyIfDispatchImplicit);
 
 
 template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
-OutputIterator copy_if(my_system &system, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate pred)
+OutputIterator copy_if(my_system &system, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate)
 {
     system.validate_dispatch();
     return result;
@@ -544,7 +690,7 @@ DECLARE_UNITTEST(TestCopyIfStencilDispatchExplicit);
 
 
 template<typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
-OutputIterator copy_if(my_tag, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate pred)
+OutputIterator copy_if(my_tag, InputIterator1, InputIterator1, InputIterator2, OutputIterator result, Predicate)
 {
     *result = 13;
     return result;
@@ -564,3 +710,72 @@ void TestCopyIfStencilDispatchImplicit()
 }
 DECLARE_UNITTEST(TestCopyIfStencilDispatchImplicit);
 
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+=(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+namespace detail
+{
+// We need this type to pass as a non-const ref for unary_transform_functor
+// to compile:
+template <>
+struct is_non_const_reference<only_set_when_expected_it> : thrust::true_type {};
+} // end namespace detail
+
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+    typedef thrust::random_access_device_iterator_tag iterator_category;
+};
+THRUST_NAMESPACE_END
+
+void TestCopyWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::copy(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestCopyWithBigIndexes()
+{
+    TestCopyWithBigIndexesHelper(30);
+    TestCopyWithBigIndexesHelper(31);
+    TestCopyWithBigIndexesHelper(32);
+    TestCopyWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCopyWithBigIndexes);
diff --git a/testing/copy_n.cu b/testing/copy_n.cu
index 206984f65..2003b1069 100644
--- a/testing/copy_n.cu
+++ b/testing/copy_n.cu
@@ -96,8 +96,6 @@ DECLARE_VECTOR_UNITTEST(TestCopyNMatchingTypes);
 template <class Vector>
 void TestCopyNMixedTypes(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(5);
     v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
 
@@ -122,7 +120,7 @@ void TestCopyNMixedTypes(void)
     ASSERT_EQUAL(d[4], 4);
     ASSERT_EQUAL_QUIET(d_result, d.end());
 }
-DECLARE_VECTOR_UNITTEST(TestCopyNMixedTypes);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyNMixedTypes);
 
 
 void TestCopyNVectorBool(void)
@@ -164,25 +162,25 @@ void TestCopyNListTo(void)
 
     typename Vector::iterator v_result = thrust::copy_n(l.begin(), l.size(), v.begin());
 
-    ASSERT_EQUAL(v[0], 0);
-    ASSERT_EQUAL(v[1], 1);
-    ASSERT_EQUAL(v[2], 2);
-    ASSERT_EQUAL(v[3], 3);
-    ASSERT_EQUAL(v[4], 4);
+    ASSERT_EQUAL(v[0], T(0));
+    ASSERT_EQUAL(v[1], T(1));
+    ASSERT_EQUAL(v[2], T(2));
+    ASSERT_EQUAL(v[3], T(3));
+    ASSERT_EQUAL(v[4], T(4));
     ASSERT_EQUAL_QUIET(v_result, v.end());
 
     l.clear();
 
     thrust::copy_n(v.begin(), v.size(), std::back_insert_iterator< std::list<T> >(l));
 
-    ASSERT_EQUAL(l.size(), 5);
+    ASSERT_EQUAL(l.size(), 5lu);
 
     typename std::list<T>::const_iterator iter = l.begin();
-    ASSERT_EQUAL(*iter, 0);  iter++;
-    ASSERT_EQUAL(*iter, 1);  iter++;
-    ASSERT_EQUAL(*iter, 2);  iter++;
-    ASSERT_EQUAL(*iter, 3);  iter++;
-    ASSERT_EQUAL(*iter, 4);  iter++;
+    ASSERT_EQUAL(*iter, T(0));  iter++;
+    ASSERT_EQUAL(*iter, T(1));  iter++;
+    ASSERT_EQUAL(*iter, T(2));  iter++;
+    ASSERT_EQUAL(*iter, T(3));  iter++;
+    ASSERT_EQUAL(*iter, T(4));  iter++;
 }
 DECLARE_VECTOR_UNITTEST(TestCopyNListTo);
 
@@ -198,12 +196,12 @@ void TestCopyNCountingIterator(void)
 
     thrust::copy_n(iter, 4, vec.begin());
 
-    ASSERT_EQUAL(vec[0], 1);
-    ASSERT_EQUAL(vec[1], 2);
-    ASSERT_EQUAL(vec[2], 3);
-    ASSERT_EQUAL(vec[3], 4);
+    ASSERT_EQUAL(vec[0], T(1));
+    ASSERT_EQUAL(vec[1], T(2));
+    ASSERT_EQUAL(vec[2], T(3));
+    ASSERT_EQUAL(vec[3], T(4));
 }
-DECLARE_VECTOR_UNITTEST(TestCopyNCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestCopyNCountingIterator);
 
 template <typename Vector>
 void TestCopyNZipIterator(void)
@@ -229,19 +227,19 @@ void TestCopyNConstantIteratorToZipIterator(void)
 {
     typedef typename Vector::value_type T;
 
-    Vector v1(3,T(0));
-    Vector v2(3,T(0));
+    Vector v1(3, T(0));
+    Vector v2(3, T(0));
 
     thrust::copy_n(thrust::make_constant_iterator(thrust::tuple<T,T>(4,7)),
                    v1.size(),
                    thrust::make_zip_iterator(thrust::make_tuple(v1.begin(),v2.begin())));
 
-    ASSERT_EQUAL(v1[0], 4);
-    ASSERT_EQUAL(v1[1], 4);
-    ASSERT_EQUAL(v1[2], 4);
-    ASSERT_EQUAL(v2[0], 7);
-    ASSERT_EQUAL(v2[1], 7);
-    ASSERT_EQUAL(v2[2], 7);
+    ASSERT_EQUAL(v1[0], T(4));
+    ASSERT_EQUAL(v1[1], T(4));
+    ASSERT_EQUAL(v1[2], T(4));
+    ASSERT_EQUAL(v2[0], T(7));
+    ASSERT_EQUAL(v2[1], T(7));
+    ASSERT_EQUAL(v2[2], T(7));
 };
 DECLARE_VECTOR_UNITTEST(TestCopyNConstantIteratorToZipIterator);
 
diff --git a/testing/count.cu b/testing/count.cu
index 092bc4f05..a6021da79 100644
--- a/testing/count.cu
+++ b/testing/count.cu
@@ -5,8 +5,6 @@
 template <class Vector>
 void TestCountSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data(5);
     data[0] = 1; data[1] = 1; data[2] = 0; data[3] = 0; data[4] = 1;
 
@@ -68,8 +66,6 @@ DECLARE_VARIABLE_UNITTEST(TestCountIf);
 template <typename Vector>
 void TestCountFromConstIteratorSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data(5);
     data[0] = 1; data[1] = 1; data[2] = 0; data[3] = 0; data[4] = 1;
 
@@ -103,7 +99,7 @@ DECLARE_UNITTEST(TestCountDispatchExplicit);
 
 
 template<typename InputIterator, typename EqualityComparable>
-int count(my_tag, InputIterator first, InputIterator, EqualityComparable x)
+int count(my_tag, InputIterator /*first*/, InputIterator, EqualityComparable x)
 {
     return x;
 }
@@ -120,3 +116,22 @@ void TestCountDispatchImplicit()
 }
 DECLARE_UNITTEST(TestCountDispatchImplicit);
 
+void TestCountWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::count(thrust::device, begin, end, (1ll << magnitude) - 17);
+
+    ASSERT_EQUAL(result, 1);
+}
+
+void TestCountWithBigIndexes()
+{
+    TestCountWithBigIndexesHelper(30);
+    TestCountWithBigIndexesHelper(31);
+    TestCountWithBigIndexesHelper(32);
+    TestCountWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestCountWithBigIndexes);
diff --git a/testing/counting_iterator.cu b/testing/counting_iterator.cu
index 8c7c0fec9..ebefe4d64 100644
--- a/testing/counting_iterator.cu
+++ b/testing/counting_iterator.cu
@@ -6,7 +6,15 @@
 #include <thrust/detail/cstdint.h>
 
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+
+template <typename T>
+void TestCountingDefaultConstructor(void)
+{
+  thrust::counting_iterator<T> iter0;
+  ASSERT_EQUAL(*iter0, T{});
+}
+DECLARE_GENERIC_UNITTEST(TestCountingDefaultConstructor);
 
 void TestCountingIteratorCopyConstructor(void)
 {
@@ -221,4 +229,4 @@ void TestCountingIteratorDifference(void)
 }
 DECLARE_UNITTEST(TestCountingIteratorDifference);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/cpp/CMakeLists.txt b/testing/cpp/CMakeLists.txt
new file mode 100644
index 000000000..215b81ee4
--- /dev/null
+++ b/testing/cpp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CPP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cpp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/cpp/adjacent_difference.cu b/testing/cpp/adjacent_difference.cu
new file mode 100644
index 000000000..584899bec
--- /dev/null
+++ b/testing/cpp/adjacent_difference.cu
@@ -0,0 +1,54 @@
+#include <unittest/unittest.h>
+#include <thrust/adjacent_difference.h>
+#include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+
+struct detect_wrong_difference
+{
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/testing/cstdint.cu b/testing/cstdint.cu
index 535d25854..5284955fd 100644
--- a/testing/cstdint.cu
+++ b/testing/cstdint.cu
@@ -5,14 +5,14 @@
 
 void TestStandardIntegerTypes(void)
 {
-  ASSERT_EQUAL(sizeof(thrust::detail::int8_t),   1);
-  ASSERT_EQUAL(sizeof(thrust::detail::int16_t),  2);
-  ASSERT_EQUAL(sizeof(thrust::detail::int32_t),  4);
-  ASSERT_EQUAL(sizeof(thrust::detail::int64_t),  8);
-  ASSERT_EQUAL(sizeof(thrust::detail::uint8_t),  1);
-  ASSERT_EQUAL(sizeof(thrust::detail::uint16_t), 2);
-  ASSERT_EQUAL(sizeof(thrust::detail::uint32_t), 4);
-  ASSERT_EQUAL(sizeof(thrust::detail::uint64_t), 8);
+  ASSERT_EQUAL(sizeof(thrust::detail::int8_t),   1lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int16_t),  2lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int32_t),  4lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::int64_t),  8lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint8_t),  1lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint16_t), 2lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint32_t), 4lu);
+  ASSERT_EQUAL(sizeof(thrust::detail::uint64_t), 8lu);
 
   ASSERT_EQUAL(sizeof(thrust::detail::intptr_t),  sizeof(void *));
   ASSERT_EQUAL(sizeof(thrust::detail::uintptr_t), sizeof(void *));
diff --git a/testing/cuda/CMakeLists.txt b/testing/cuda/CMakeLists.txt
new file mode 100644
index 000000000..8fe4a4be7
--- /dev/null
+++ b/testing/cuda/CMakeLists.txt
@@ -0,0 +1,35 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+# These tests always build with RDC, so make sure that the sm_XX flags are
+# compatible. See note in ThrustCudaConfig.cmake.
+# TODO once we're using CUDA_ARCHITECTURES, we can setup non-rdc fallback
+# tests to build for non-rdc arches. But for now, all files in a given directory
+# must build with the same `CMAKE_CUDA_FLAGS` due to CMake constraints around
+# how CUDA_FLAGS works.
+set(CMAKE_CUDA_FLAGS "${THRUST_CUDA_FLAGS_BASE} ${THRUST_CUDA_FLAGS_RDC}")
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "CUDA")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "cuda.")
+
+    # Create two targets, one with RDC enabled, the other without. This tests
+    # both device-side behaviors -- the CDP kernel launch with RDC, and the
+    # serial fallback path without RDC.
+    thrust_add_test(seq_test_target ${test_name}.cdp_0 "${test_src}" ${thrust_target})
+
+    if (THRUST_ENABLE_TESTS_WITH_RDC)
+      thrust_add_test(cdp_test_target ${test_name}.cdp_1 "${test_src}" ${thrust_target})
+      thrust_enable_rdc_for_cuda_target(${cdp_test_target})
+    endif()
+  endforeach()
+endforeach()
diff --git a/testing/backend/cuda/adjacent_difference.cu b/testing/cuda/adjacent_difference.cu
similarity index 54%
rename from testing/backend/cuda/adjacent_difference.cu
rename to testing/cuda/adjacent_difference.cu
index 6d2c5d253..9b101ea2e 100644
--- a/testing/backend/cuda/adjacent_difference.cu
+++ b/testing/cuda/adjacent_difference.cu
@@ -1,8 +1,11 @@
 #include <unittest/unittest.h>
 #include <thrust/adjacent_difference.h>
 #include <thrust/execution_policy.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__ void adjacent_difference_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
 {
@@ -22,24 +25,36 @@ void TestAdjacentDifferenceDevice(ExecutionPolicy exec, const size_t n)
 {
   thrust::host_vector<T>   h_input = unittest::random_samples<T>(n);
   thrust::device_vector<T> d_input = h_input;
-  
+
   thrust::host_vector<T>   h_output(n);
   thrust::device_vector<T> d_output(n);
-  
+
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
-  
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_output, d_output);
-  
+
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_output.begin(), thrust::plus<T>());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), thrust::plus<T>());
-  
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_output, d_output);
-  
+
   // in-place operation
   thrust::adjacent_difference(h_input.begin(), h_input.end(), h_input.begin(), thrust::plus<T>());
   adjacent_difference_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_input.begin(), thrust::plus<T>());
-  
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_input, h_output); //computed previously
   ASSERT_EQUAL(d_input, d_output); //computed previously
 }
@@ -59,21 +74,22 @@ void TestAdjacentDifferenceDeviceDevice(const size_t n)
   TestAdjacentDifferenceDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestAdjacentDifferenceDeviceDevice);
+#endif
 
 
 void TestAdjacentDifferenceCudaStreams()
 {
   cudaStream_t s;
   cudaStreamCreate(&s);
-  
+
   thrust::device_vector<int> input(3);
   thrust::device_vector<int> output(3);
   input[0] = 1; input[1] = 4; input[2] = 6;
-  
+
   thrust::adjacent_difference(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin());
 
   cudaStreamSynchronize(s);
-  
+
   ASSERT_EQUAL(output[0], 1);
   ASSERT_EQUAL(output[1], 3);
   ASSERT_EQUAL(output[2], 2);
@@ -82,3 +98,57 @@ void TestAdjacentDifferenceCudaStreams()
 }
 DECLARE_UNITTEST(TestAdjacentDifferenceCudaStreams);
 
+struct detect_wrong_difference
+{
+    using difference_type = void;
+    using value_type = void;
+    using pointer = void;
+    using reference = void;
+    using iterator_category = std::output_iterator_tag;
+
+    bool * flag;
+
+    __host__ __device__ detect_wrong_difference operator++() const { return *this; }
+    __host__ __device__ detect_wrong_difference operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ detect_wrong_difference operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ detect_wrong_difference operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long difference) const
+    {
+        if (difference != 1)
+        {
+            *flag = false;
+        }
+    }
+};
+
+void TestAdjacentDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> all_differences_correct = thrust::device_malloc<bool>(1);
+    *all_differences_correct = true;
+
+    detect_wrong_difference out = { thrust::raw_pointer_cast(all_differences_correct) };
+
+    thrust::adjacent_difference(thrust::device, begin, end, out);
+
+    bool all_differences_correct_h = *all_differences_correct;
+    thrust::device_free(all_differences_correct);
+
+    ASSERT_EQUAL(all_differences_correct_h, true);
+}
+
+void TestAdjacentDifferenceWithBigIndexes()
+{
+    TestAdjacentDifferenceWithBigIndexesHelper(30);
+    TestAdjacentDifferenceWithBigIndexesHelper(31);
+    TestAdjacentDifferenceWithBigIndexesHelper(32);
+    TestAdjacentDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestAdjacentDifferenceWithBigIndexes);
diff --git a/testing/cuda/adjacent_difference.mk b/testing/cuda/adjacent_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/adjacent_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/binary_search.cu b/testing/cuda/binary_search.cu
new file mode 100644
index 000000000..58a83f61c
--- /dev/null
+++ b/testing/cuda/binary_search.cu
@@ -0,0 +1,25 @@
+#include <unittest/unittest.h>
+
+#include <thrust/binary_search.h>
+#include <thrust/device_vector.h>
+#include <thrust/distance.h>
+#include <thrust/pair.h>
+#include <thrust/sequence.h>
+
+void TestEqualRangeOnStream()
+{ // Regression test for GH issue #921 (nvbug 2173437)
+  typedef typename thrust::device_vector<int> vector_t;
+  typedef typename vector_t::iterator iterator_t;
+  typedef thrust::pair<iterator_t, iterator_t> result_t;
+
+  vector_t input(10);
+  thrust::sequence(thrust::device, input.begin(), input.end(), 0);
+  cudaStream_t stream = 0;
+  result_t result = thrust::equal_range(thrust::cuda::par.on(stream),
+                                        input.begin(), input.end(),
+                                        5);
+
+  ASSERT_EQUAL(5, thrust::distance(input.begin(), result.first));
+  ASSERT_EQUAL(6, thrust::distance(input.begin(), result.second));
+}
+DECLARE_UNITTEST(TestEqualRangeOnStream);
diff --git a/testing/cuda/binary_search.mk b/testing/cuda/binary_search.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/binary_search.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/complex.cu b/testing/cuda/complex.cu
new file mode 100644
index 000000000..8034541ff
--- /dev/null
+++ b/testing/cuda/complex.cu
@@ -0,0 +1,53 @@
+#include <unittest/unittest.h>
+
+#include <thrust/complex.h>
+#include <thrust/detail/preprocessor.h>
+#include <thrust/detail/alignment.h>
+
+#include <cuda_fp16.h>
+
+template <typename T, typename VectorT>
+void TestComplexAlignment()
+{
+  THRUST_STATIC_ASSERT(
+    sizeof(thrust::complex<T>) == sizeof(VectorT)
+  );
+  THRUST_STATIC_ASSERT(
+    THRUST_ALIGNOF(thrust::complex<T>) == THRUST_ALIGNOF(VectorT)
+  );
+
+  THRUST_STATIC_ASSERT(
+    sizeof(thrust::complex<T const>) == sizeof(VectorT)
+  );
+  THRUST_STATIC_ASSERT(
+    THRUST_ALIGNOF(thrust::complex<T const>) == THRUST_ALIGNOF(VectorT)
+  );
+}
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<char, char2>)
+, TestComplexCharAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<short, short2>)
+, TestComplexShortAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<int, int2>)
+, TestComplexIntAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<long, long2>)
+, TestComplexLongAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<__half, __half2>)
+, TestComplexHalfAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<float, float2>)
+, TestComplexFloatAlignment
+);
+DECLARE_UNITTEST_WITH_NAME(
+  THRUST_PP_EXPAND_ARGS(TestComplexAlignment<double, double2>)
+, TestComplexDoubleAlignment
+);
diff --git a/testing/cuda/complex.mk b/testing/cuda/complex.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/complex.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/copy.cu b/testing/cuda/copy.cu
similarity index 89%
rename from testing/backend/cuda/copy.cu
rename to testing/cuda/copy.cu
index d37a9c1ef..6fe91853d 100644
--- a/testing/backend/cuda/copy.cu
+++ b/testing/cuda/copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -22,6 +23,10 @@ void TestCopyDevice(ExecutionPolicy exec, size_t n)
   
   thrust::copy(h_src.begin(), h_src.end(), h_dst.begin());
   copy_kernel<<<1,1>>>(exec, d_src.begin(), d_src.end(), d_dst.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_dst, d_dst);
 }
@@ -62,6 +67,10 @@ void TestCopyNDevice(ExecutionPolicy exec, size_t n)
   
   thrust::copy_n(h_src.begin(), h_src.size(), h_dst.begin());
   copy_n_kernel<<<1,1>>>(exec, d_src.begin(), d_src.size(), d_dst.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_dst, d_dst);
 }
@@ -81,4 +90,5 @@ void TestCopyNDeviceDevice(size_t n)
   TestCopyNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestCopyNDeviceDevice);
+#endif
 
diff --git a/testing/cuda/copy.mk b/testing/cuda/copy.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/copy_if.cu b/testing/cuda/copy_if.cu
similarity index 73%
rename from testing/backend/cuda/copy_if.cu
rename to testing/cuda/copy_if.cu
index 34b7fd366..bb879b671 100644
--- a/testing/backend/cuda/copy_if.cu
+++ b/testing/cuda/copy_if.cu
@@ -3,7 +3,6 @@
 #include <thrust/sequence.h>
 #include <thrust/execution_policy.h>
 
-
 template<typename T>
 struct is_even
 {
@@ -20,6 +19,7 @@ struct mod_3
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Predicate pred, Iterator3 result2)
 {
@@ -47,7 +47,11 @@ void TestCopyIfDevice(ExecutionPolicy exec)
     thrust::device_vector<int> d_result(n);
     
     h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<int>());
+
     copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_even<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
     d_new_end = d_new_end_vec[0];
     
     h_result.resize(h_new_end - h_result.begin());
@@ -62,7 +66,11 @@ void TestCopyIfDevice(ExecutionPolicy exec)
     thrust::device_vector<int> d_result(n);
     
     h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<int>());
+
     copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), mod_3<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
     d_new_end = d_new_end_vec[0];
     
     h_result.resize(h_new_end - h_result.begin());
@@ -87,10 +95,17 @@ void TestCopyIfDeviceDevice()
 DECLARE_UNITTEST(TestCopyIfDeviceDevice);
 
 
-void TestCopyIfCudaStreams()
+void TestCopyIfDeviceNoSync()
+{
+  TestCopyIfDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfDeviceNoSync);
+#endif
+
+template<typename ExecutionPolicy>
+void TestCopyIfCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -104,11 +119,11 @@ void TestCopyIfCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
-                                                  data.begin(), 
-                                                  data.end(), 
-                                                  result.begin(),
-                                                  is_even<int>());
+  Vector::iterator end = thrust::copy_if(policy.on(s),
+                                         data.begin(), 
+                                         data.end(), 
+                                         result.begin(),
+                                         is_even<int>());
 
   ASSERT_EQUAL(end - result.begin(), 2);
 
@@ -117,9 +132,19 @@ void TestCopyIfCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestCopyIfCudaStreams);
 
+void TestCopyIfCudaStreamsSync(){
+  TestCopyIfCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestCopyIfCudaStreamsSync);
+
+void TestCopyIfCudaStreamsNoSync(){
+  TestCopyIfCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfCudaStreamsNoSync);
 
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__ void copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 result1, Predicate pred, Iterator4 result2)
 {
@@ -137,9 +162,6 @@ void TestCopyIfStencilDevice(ExecutionPolicy exec)
   thrust::host_vector<int>   h_stencil = unittest::random_integers<int>(n);
   thrust::device_vector<int> d_stencil = unittest::random_integers<int>(n);
   
-  thrust::host_vector<int>   h_result(n);
-  thrust::device_vector<int> d_result(n);
-  
   typename thrust::host_vector<int>::iterator   h_new_end;
   typename thrust::device_vector<int>::iterator d_new_end;
 
@@ -153,7 +175,11 @@ void TestCopyIfStencilDevice(ExecutionPolicy exec)
     thrust::device_vector<int> d_result(n);
     
     h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_even<int>());
+
     copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_even<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
     d_new_end = d_new_end_vec[0];
     
     h_result.resize(h_new_end - h_result.begin());
@@ -168,7 +194,11 @@ void TestCopyIfStencilDevice(ExecutionPolicy exec)
     thrust::device_vector<int> d_result(n);
     
     h_new_end = thrust::copy_if(h_data.begin(), h_data.end(), h_result.begin(), mod_3<int>());
+
     copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), mod_3<int>(), d_new_end_vec.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
     d_new_end = d_new_end_vec[0];
     
     h_result.resize(h_new_end - h_result.begin());
@@ -193,10 +223,19 @@ void TestCopyIfStencilDeviceDevice()
 DECLARE_UNITTEST(TestCopyIfStencilDeviceDevice);
 
 
-void TestCopyIfStencilCudaStreams()
+void TestCopyIfStencilDeviceNoSync()
+{
+  TestCopyIfStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfStencilDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestCopyIfStencilCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -217,12 +256,12 @@ void TestCopyIfStencilCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::copy_if(thrust::cuda::par.on(s),
-                                                  data.begin(), 
-                                                  data.end(),
-                                                  stencil.begin(),
-                                                  result.begin(),
-                                                  thrust::identity<T>());
+  Vector::iterator end = thrust::copy_if(policy.on(s),
+                                         data.begin(), 
+                                         data.end(),
+                                         stencil.begin(),
+                                         result.begin(),
+                                         thrust::identity<T>());
 
   ASSERT_EQUAL(end - result.begin(), 2);
 
@@ -231,5 +270,17 @@ void TestCopyIfStencilCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestCopyIfStencilCudaStreams);
+
+void TestCopyIfStencilCudaStreamsSync()
+{
+  TestCopyIfStencilCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsSync);
+
+
+void TestCopyIfStencilCudaStreamsNoSync()
+{
+  TestCopyIfStencilCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestCopyIfStencilCudaStreamsNoSync);
 
diff --git a/testing/cuda/copy_if.mk b/testing/cuda/copy_if.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/copy_if.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/count.cu b/testing/cuda/count.cu
similarity index 93%
rename from testing/backend/cuda/count.cu
rename to testing/cuda/count.cu
index e0a14b9b1..e2b9b5f5a 100644
--- a/testing/backend/cuda/count.cu
+++ b/testing/cuda/count.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__
 void count_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
@@ -22,6 +23,8 @@ void TestCountDevice(ExecutionPolicy exec, const size_t n)
   size_t h_result = thrust::count(h_data.begin(), h_data.end(), T(5));
 
   count_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), T(5), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_result, d_result[0]);
 }
@@ -68,6 +71,8 @@ void TestCountIfDevice(ExecutionPolicy exec, const size_t n)
   
   size_t h_result = thrust::count_if(h_data.begin(), h_data.end(), greater_than_five<T>());
   count_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), greater_than_five<T>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_result, d_result[0]);
 }
@@ -87,6 +92,7 @@ void TestCountIfDeviceDevice(const size_t n)
   TestCountIfDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestCountIfDeviceDevice);
+#endif
 
 
 void TestCountCudaStreams()
diff --git a/testing/cuda/count.mk b/testing/cuda/count.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/count.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/cudart.cu b/testing/cuda/cudart.cu
similarity index 100%
rename from testing/backend/cuda/cudart.cu
rename to testing/cuda/cudart.cu
diff --git a/testing/cuda/cudart.mk b/testing/cuda/cudart.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/cudart.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/device_side_universal_vector.cu b/testing/cuda/device_side_universal_vector.cu
new file mode 100644
index 000000000..a31919cfc
--- /dev/null
+++ b/testing/cuda/device_side_universal_vector.cu
@@ -0,0 +1,84 @@
+#include <thrust/universal_vector.h>
+
+#include <unittest/unittest.h>
+
+template <class VecT>
+__host__ __device__ void universal_vector_access(VecT &in, thrust::universal_vector<bool> &out)
+{
+  const int expected_front  = 4;
+  const int expected_back   = 2;
+
+  out[0] = in.size() == 2 &&               //
+           in[0] == expected_front &&      //
+           in.front() == expected_front && //
+           *in.data() == expected_front && //
+           in[1] == expected_back &&       //
+           in.back() == expected_back;
+}
+
+#if defined(THRUST_TEST_DEVICE_SIDE)
+template <class VecT>
+__global__ void universal_vector_device_access_kernel(VecT &vec,
+                                                      thrust::universal_vector<bool> &out)
+{
+  universal_vector_access(vec, out);
+}
+
+template <class VecT>
+void test_universal_vector_access(VecT &vec, thrust::universal_vector<bool> &out)
+{
+  universal_vector_device_access_kernel<<<1, 1>>>(vec, out);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  ASSERT_EQUAL(out[0], true);
+}
+#else
+template <class VecT>
+void test_universal_vector_access(VecT &vec, thrust::universal_vector<bool> &out)
+{
+  universal_vector_access(vec, out);
+  ASSERT_EQUAL(out[0], true);
+}
+#endif
+
+void TestUniversalVectorDeviceAccess()
+{
+  thrust::universal_vector<thrust::universal_vector<int>> in_storage(1);
+  thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+  in.resize(2);
+  in[0] = 4;
+  in[1] = 2;
+
+  thrust::universal_vector<thrust::universal_vector<bool>> out_storage(1);
+  thrust::universal_vector<bool> &out = *thrust::raw_pointer_cast(out_storage.data());
+  out.resize(1);
+  out[0] = false;
+
+  test_universal_vector_access(in, out);
+}
+DECLARE_UNITTEST(TestUniversalVectorDeviceAccess);
+
+void TestConstUniversalVectorDeviceAccess()
+{
+  thrust::universal_vector<thrust::universal_vector<int>> in_storage(1);
+
+  {
+    thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+    in.resize(2);
+    in[0] = 4;
+    in[1] = 2;
+  }
+
+  const thrust::universal_vector<int> &in = *thrust::raw_pointer_cast(in_storage.data());
+
+  thrust::universal_vector<thrust::universal_vector<bool>> out_storage(1);
+  thrust::universal_vector<bool> &out = *thrust::raw_pointer_cast(out_storage.data());
+
+  out.resize(1);
+  out[0] = false;
+
+  test_universal_vector_access(in, out);
+}
+DECLARE_UNITTEST(TestConstUniversalVectorDeviceAccess);
diff --git a/testing/backend/cuda/equal.cu b/testing/cuda/equal.cu
similarity index 86%
rename from testing/backend/cuda/equal.cu
rename to testing/cuda/equal.cu
index c0ac4418d..c5e794ed5 100644
--- a/testing/backend/cuda/equal.cu
+++ b/testing/cuda/equal.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void equal_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
@@ -29,10 +30,20 @@ void TestEqualDevice(ExecutionPolicy exec, const size_t n)
   
   //empty ranges
   equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin(), d_data1.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_result[0], true);
   
   //symmetric cases
   equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.end(), d_data1.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_result[0], true);
   
   if(n > 0)
@@ -41,12 +52,28 @@ void TestEqualDevice(ExecutionPolicy exec, const size_t n)
     
     //different vectors
     equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.end(), d_data2.begin(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(d_result[0], false);
     
     //different predicates
     equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::less<T>(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(d_result[0], true);
+
     equal_kernel<<<1,1>>>(exec, d_data1.begin(), d_data1.begin() + 1, d_data2.begin(), thrust::greater<T>(), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(d_result[0], false);
   }
 }
@@ -66,6 +93,7 @@ void TestEqualDeviceDevice(const size_t n)
   TestEqualDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestEqualDeviceDevice);
+#endif
 
 
 void TestEqualCudaStreams()
diff --git a/testing/cuda/equal.mk b/testing/cuda/equal.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/equal.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/fill.cu b/testing/cuda/fill.cu
similarity index 83%
rename from testing/backend/cuda/fill.cu
rename to testing/cuda/fill.cu
index d774a28bc..ee0a51776 100644
--- a/testing/backend/cuda/fill.cu
+++ b/testing/cuda/fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T>
 __global__
 void fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value)
@@ -18,27 +19,52 @@ void TestFillDevice(ExecutionPolicy exec, size_t n)
   thrust::device_vector<T> d_data = h_data;
   
   thrust::fill(h_data.begin() + std::min((size_t)1, n), h_data.begin() + std::min((size_t)3, n), (T) 0);
+
   fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)1, n), d_data.begin() + std::min((size_t)3, n), (T) 0);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   thrust::fill(h_data.begin() + std::min((size_t)117, n), h_data.begin() + std::min((size_t)367, n), (T) 1);
+
   fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2);
+
   fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3);
+
   fill_kernel<<<1,1>>>(exec, d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   thrust::fill(h_data.begin(), h_data.end(), (T) 4);
+
   fill_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), (T) 4);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
 }
@@ -73,31 +99,60 @@ void TestFillNDevice(ExecutionPolicy exec, size_t n)
   thrust::device_vector<T> d_data = h_data;
   
   size_t begin_offset = std::min<size_t>(1,n);
+
   thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
+
   fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)3, n) - begin_offset, (T) 0);
-  
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
   ASSERT_EQUAL(h_data, d_data);
   
   begin_offset = std::min<size_t>(117, n);
+
   thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+
   fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   begin_offset = std::min<size_t>(8, n);
+
   thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+
   fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   begin_offset = std::min<size_t>(3, n);
+
   thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3);
+
   fill_n_kernel<<<1,1>>>(exec, d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
   
   thrust::fill_n(h_data.begin(), h_data.size(), (T) 4);
+
   fill_n_kernel<<<1,1>>>(exec, d_data.begin(), d_data.size(), (T) 4);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_data, d_data);
 }
@@ -115,6 +170,7 @@ void TestFillNDeviceDevice(size_t n)
   TestFillNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestFillNDeviceDevice);
+#endif
 
 void TestFillCudaStreams()
 {
diff --git a/testing/cuda/fill.mk b/testing/cuda/fill.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/find.cu b/testing/cuda/find.cu
similarity index 90%
rename from testing/backend/cuda/find.cu
rename to testing/cuda/find.cu
index 16b33b40d..fbd86f5a0 100644
--- a/testing/backend/cuda/find.cu
+++ b/testing/cuda/find.cu
@@ -39,6 +39,7 @@ struct less_than_value_pred
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__ void find_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T value, Iterator2 result)
 {
@@ -60,15 +61,27 @@ void TestFindDevice(ExecutionPolicy exec)
   thrust::device_vector<iter_type> d_result(1);
   
   h_iter = thrust::find(h_data.begin(), h_data.end(), int(0));
+
   find_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), int(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   
   for(size_t i = 1; i < n; i *= 2)
   {
     int sample = h_data[i];
+
     h_iter = thrust::find(h_data.begin(), h_data.end(), sample);
+
     find_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), sample, d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   }
 }
@@ -109,14 +122,27 @@ void TestFindIfDevice(ExecutionPolicy exec)
   thrust::device_vector<iter_type> d_result(1);
   
   h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<int>(0));
+
   find_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), equal_to_value_pred<int>(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   
   for (size_t i = 1; i < n; i *= 2)
   {
     int sample = h_data[i];
+
     h_iter = thrust::find_if(h_data.begin(), h_data.end(), equal_to_value_pred<int>(sample));
+
     find_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), equal_to_value_pred<int>(sample), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   }
 }
@@ -156,14 +182,27 @@ void TestFindIfNotDevice(ExecutionPolicy exec)
   thrust::device_vector<iter_type> d_result(1);
   
   h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<int>(0));
+
   find_if_not_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), not_equal_to_value_pred<int>(0), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   
   for(size_t i = 1; i < n; i *= 2)
   {
     int sample = h_data[i];
+
     h_iter = thrust::find_if_not(h_data.begin(), h_data.end(), not_equal_to_value_pred<int>(sample));
+
     find_if_not_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), not_equal_to_value_pred<int>(sample), d_result.begin());
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
+
     ASSERT_EQUAL(h_iter - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
   }
 }
@@ -181,6 +220,7 @@ void TestFindIfNotDeviceDevice()
   TestFindIfNotDevice(thrust::device);
 };
 DECLARE_UNITTEST(TestFindIfNotDeviceDevice);
+#endif
 
 
 void TestFindCudaStreams()
diff --git a/testing/cuda/find.mk b/testing/cuda/find.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/find.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/for_each.cu b/testing/cuda/for_each.cu
similarity index 88%
rename from testing/backend/cuda/for_each.cu
rename to testing/cuda/for_each.cu
index ab6570a9d..afd54c621 100644
--- a/testing/backend/cuda/for_each.cu
+++ b/testing/cuda/for_each.cu
@@ -6,7 +6,7 @@
 static const size_t NUM_REGISTERS = 64;
 
 template <size_t N> __host__ __device__ void f   (int * x) { int temp = *x; f<N - 1>(x + 1); *x = temp;};
-template <>         __host__ __device__ void f<0>(int * x) { }
+template <>         __host__ __device__ void f<0>(int * /*x*/) { }
 template <size_t N>
 struct CopyFunctorWithManyRegisters
 {
@@ -59,6 +59,7 @@ struct mark_present_for_each
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function>
 __global__ void for_each_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
 {
@@ -89,7 +90,9 @@ void TestForEachDeviceSeq(const size_t n)
   thrust::for_each(h_input.begin(), h_input.end(), h_f);
   
   for_each_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.end(), d_f);
-  
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+ 
   ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestForEachDeviceSeq);
@@ -103,7 +106,7 @@ void TestForEachDeviceDevice(const size_t n)
   thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
   
   for(size_t i = 0; i < n; i++)
-    h_input[i] =  ((size_t) h_input[i]) % output_size;
+    h_input[i] = ((size_t) h_input[i]) % output_size;
   
   thrust::device_vector<T> d_input = h_input;
   
@@ -118,7 +121,15 @@ void TestForEachDeviceDevice(const size_t n)
   thrust::for_each(h_input.begin(), h_input.end(), h_f);
   
   for_each_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.end(), d_f);
-  
+  {
+    cudaError_t const err = cudaGetLastError();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
   ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestForEachDeviceDevice);
@@ -140,7 +151,7 @@ void TestForEachNDeviceSeq(const size_t n)
   thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
   
   for(size_t i = 0; i < n; i++)
-    h_input[i] =  ((size_t) h_input[i]) % output_size;
+    h_input[i] =  static_cast<T>(((size_t) h_input[i]) % output_size);
   
   thrust::device_vector<T> d_input = h_input;
   
@@ -155,6 +166,8 @@ void TestForEachNDeviceSeq(const size_t n)
   thrust::for_each_n(h_input.begin(), h_input.size(), h_f);
   
   for_each_n_kernel<<<1,1>>>(thrust::seq, d_input.begin(), d_input.size(), d_f);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_output, d_output);
 }
@@ -169,7 +182,7 @@ void TestForEachNDeviceDevice(const size_t n)
   thrust::host_vector<T> h_input = unittest::random_integers<T>(n);
   
   for(size_t i = 0; i < n; i++)
-    h_input[i] =  ((size_t) h_input[i]) % output_size;
+    h_input[i] =  static_cast<T>(((size_t) h_input[i]) % output_size);
   
   thrust::device_vector<T> d_input = h_input;
   
@@ -184,10 +197,13 @@ void TestForEachNDeviceDevice(const size_t n)
   thrust::for_each_n(h_input.begin(), h_input.size(), h_f);
   
   for_each_n_kernel<<<1,1>>>(thrust::device, d_input.begin(), d_input.size(), d_f);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestForEachNDeviceDevice);
+#endif
 
 
 void TestForEachCudaStreams()
diff --git a/testing/cuda/for_each.mk b/testing/cuda/for_each.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/for_each.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/gather.cu b/testing/cuda/gather.cu
similarity index 95%
rename from testing/backend/cuda/gather.cu
rename to testing/cuda/gather.cu
index 1ac0c4cf5..6af4d4727 100644
--- a/testing/backend/cuda/gather.cu
+++ b/testing/cuda/gather.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void gather_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 elements_first, Iterator3 result)
@@ -33,7 +34,12 @@ void TestGatherDevice(ExecutionPolicy exec, const size_t n)
   thrust::device_vector<T> d_output(n);
   
   thrust::gather(h_map.begin(), h_map.end(), h_source.begin(), h_output.begin());
+
   gather_kernel<<<1,1>>>(exec, d_map.begin(), d_map.end(), d_source.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_output, d_output);
 }
@@ -51,6 +57,7 @@ void TestGatherDeviceDevice(const size_t n)
   TestGatherDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGatherDeviceDevice);
+#endif
 
 
 void TestGatherCudaStreams()
@@ -80,6 +87,7 @@ void TestGatherCudaStreams()
 DECLARE_UNITTEST(TestGatherCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate>
 __global__
 void gather_if_kernel(ExecutionPolicy exec, Iterator1 map_first, Iterator1 map_last, Iterator2 stencil_first, Iterator3 elements_first, Iterator4 result, Predicate pred)
@@ -129,7 +137,12 @@ void TestGatherIfDevice(ExecutionPolicy exec, const size_t n)
   thrust::device_vector<T> d_output(n);
   
   thrust::gather_if(h_map.begin(), h_map.end(), h_stencil.begin(), h_source.begin(), h_output.begin(), is_even_gather_if<unsigned int>());
+
   gather_if_kernel<<<1,1>>>(exec, d_map.begin(), d_map.end(), d_stencil.begin(), d_source.begin(), d_output.begin(), is_even_gather_if<unsigned int>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_output, d_output);
 }
@@ -147,6 +160,7 @@ void TestGatherIfDeviceDevice(const size_t n)
   TestGatherIfDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGatherIfDeviceDevice);
+#endif
 
 void TestGatherIfCudaStreams(void)
 {
diff --git a/testing/cuda/gather.mk b/testing/cuda/gather.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/gather.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/generate.cu b/testing/cuda/generate.cu
similarity index 92%
rename from testing/backend/cuda/generate.cu
rename to testing/cuda/generate.cu
index acf9513ae..407da920c 100644
--- a/testing/backend/cuda/generate.cu
+++ b/testing/cuda/generate.cu
@@ -3,14 +3,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator, typename Function>
-__global__
-void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
-{
-  thrust::generate(exec, first, last, f);
-}
-
-
 template<typename T>
 struct return_value
 {
@@ -24,6 +16,15 @@ struct return_value
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator, typename Function>
+__global__
+void generate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
+{
+  thrust::generate(exec, first, last, f);
+}
+
+
 template<typename T, typename ExecutionPolicy>
 void TestGenerateDevice(ExecutionPolicy exec, const size_t n)
 {
@@ -34,7 +35,12 @@ void TestGenerateDevice(ExecutionPolicy exec, const size_t n)
   return_value<T> f(value);
   
   thrust::generate(h_result.begin(), h_result.end(), f);
+
   generate_kernel<<<1,1>>>(exec, d_result.begin(), d_result.end(), f);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_result, d_result);
 }
@@ -54,6 +60,7 @@ void TestGenerateDeviceDevice(const size_t n)
   TestGenerateDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGenerateDeviceDevice);
+#endif
 
 
 void TestGenerateCudaStreams()
@@ -81,6 +88,7 @@ void TestGenerateCudaStreams()
 DECLARE_UNITTEST(TestGenerateCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Size, typename Function>
 __global__
 void generate_n_kernel(ExecutionPolicy exec, Iterator first, Size n, Function f)
@@ -99,7 +107,12 @@ void TestGenerateNDevice(ExecutionPolicy exec, const size_t n)
   return_value<T> f(value);
   
   thrust::generate_n(h_result.begin(), h_result.size(), f);
+
   generate_n_kernel<<<1,1>>>(exec, d_result.begin(), d_result.size(), f);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(h_result, d_result);
 }
@@ -119,6 +132,7 @@ void TestGenerateNDeviceDevice(const size_t n)
   TestGenerateNDevice<T>(thrust::device, n);
 }
 DECLARE_VARIABLE_UNITTEST(TestGenerateNDeviceDevice);
+#endif
 
 
 void TestGenerateNCudaStreams()
diff --git a/testing/cuda/generate.mk b/testing/cuda/generate.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/generate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/inner_product.cu b/testing/cuda/inner_product.cu
similarity index 92%
rename from testing/backend/cuda/inner_product.cu
rename to testing/cuda/inner_product.cu
index fbb8bbee8..0c2276942 100644
--- a/testing/backend/cuda/inner_product.cu
+++ b/testing/cuda/inner_product.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename T, typename Iterator3>
 __global__
 void inner_product_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, T init, Iterator3 result)
@@ -27,7 +28,12 @@ void TestInnerProductDevice(ExecutionPolicy exec)
   int init = 13;
   
   int expected = thrust::inner_product(h_v1.begin(), h_v1.end(), h_v2.begin(), init);
+
   inner_product_kernel<<<1,1>>>(exec, d_v1.begin(), d_v1.end(), d_v2.begin(), init, result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(expected, result[0]);
 }
@@ -45,6 +51,7 @@ void TestInnerProductDeviceDevice()
   TestInnerProductDevice(thrust::device);
 };
 DECLARE_UNITTEST(TestInnerProductDeviceDevice);
+#endif
 
 
 void TestInnerProductCudaStreams()
diff --git a/testing/cuda/inner_product.mk b/testing/cuda/inner_product.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/inner_product.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/is_partitioned.cu b/testing/cuda/is_partitioned.cu
similarity index 91%
rename from testing/backend/cuda/is_partitioned.cu
rename to testing/cuda/is_partitioned.cu
index 420b7d9a2..468e17746 100644
--- a/testing/backend/cuda/is_partitioned.cu
+++ b/testing/cuda/is_partitioned.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Predicate, typename Iterator2>
 __global__
 void is_partitioned_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Predicate pred, Iterator2 result)
@@ -35,12 +36,20 @@ void TestIsPartitionedDevice(ExecutionPolicy exec)
   v[1] = 0;
 
   is_partitioned_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(false, result[0]);
 
   thrust::partition(v.begin(), v.end(), is_even<int>());
 
   is_partitioned_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(true, result[0]);
 }
@@ -58,6 +67,7 @@ void TestIsPartitionedDeviceDevice()
   TestIsPartitionedDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsPartitionedDeviceDevice);
+#endif
 
 
 void TestIsPartitionedCudaStreams()
diff --git a/testing/cuda/is_partitioned.mk b/testing/cuda/is_partitioned.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_partitioned.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/is_sorted.cu b/testing/cuda/is_sorted.cu
similarity index 90%
rename from testing/backend/cuda/is_sorted.cu
rename to testing/cuda/is_sorted.cu
index 9b713bcd4..1e9ef16ae 100644
--- a/testing/backend/cuda/is_sorted.cu
+++ b/testing/cuda/is_sorted.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void is_sorted_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -24,11 +25,22 @@ void TestIsSortedDevice(ExecutionPolicy exec)
   v[1] = 0;
 
   is_sorted_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 
   thrust::sort(v.begin(), v.end());
 
   is_sorted_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 }
 
@@ -44,6 +56,7 @@ void TestIsSortedDeviceDevice()
   TestIsSortedDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsSortedDeviceDevice);
+#endif
 
 
 void TestIsSortedCudaStreams()
diff --git a/testing/cuda/is_sorted.mk b/testing/cuda/is_sorted.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_sorted.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/is_sorted_until.cu b/testing/cuda/is_sorted_until.cu
similarity index 90%
rename from testing/backend/cuda/is_sorted_until.cu
rename to testing/cuda/is_sorted_until.cu
index 0639e5ef5..9e6d5ac76 100644
--- a/testing/backend/cuda/is_sorted_until.cu
+++ b/testing/cuda/is_sorted_until.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void is_sorted_until_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -26,11 +27,21 @@ void TestIsSortedUntilDevice(ExecutionPolicy exec)
   v[1] = 0;
   
   is_sorted_until_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL_QUIET(v.begin() + 1, (iter_type)result[0]);
   
   thrust::sort(v.begin(), v.end());
   
   is_sorted_until_kernel<<<1,1>>>(exec, v.begin(), v.end(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL_QUIET(v.end(), (iter_type)result[0]);
 }
 
@@ -47,14 +58,15 @@ void TestIsSortedUntilDeviceDevice()
   TestIsSortedUntilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestIsSortedUntilDeviceDevice);
+#endif
 
 
 void TestIsSortedUntilCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
 
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::value_type T;
+  typedef Vector::iterator Iterator;
 
   cudaStream_t s;
   cudaStreamCreate(&s);
diff --git a/testing/cuda/is_sorted_until.mk b/testing/cuda/is_sorted_until.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/is_sorted_until.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/logical.cu b/testing/cuda/logical.cu
similarity index 78%
rename from testing/backend/cuda/logical.cu
rename to testing/cuda/logical.cu
index b9873775c..a08f041b7 100644
--- a/testing/backend/cuda/logical.cu
+++ b/testing/cuda/logical.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void all_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -20,23 +21,53 @@ void TestAllOfDevice(ExecutionPolicy exec)
   thrust::device_vector<bool> result(1);
   
   all_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
   
   v[1] = 0;
   
   all_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
   
   all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 
   all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 
   all_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 
   all_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 }
 
@@ -53,12 +84,13 @@ void TestAllOfDeviceDevice()
   TestAllOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestAllOfDeviceDevice);
+#endif
 
 
 void TestAllOfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
   
   Vector v(3, 1);
 
@@ -81,6 +113,7 @@ void TestAllOfCudaStreams()
 DECLARE_UNITTEST(TestAllOfCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void any_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -98,23 +131,53 @@ void TestAnyOfDevice(ExecutionPolicy exec)
   thrust::device_vector<bool> result(1);
   
   any_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
   
   v[1] = 0;
   
   any_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
   
   any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 
   any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 
   any_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 
   any_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 }
 
@@ -131,12 +194,13 @@ void TestAnyOfDeviceDevice()
   TestAnyOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestAnyOfDeviceDevice);
+#endif
 
 
 void TestAnyOfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector v(3, 1);
 
@@ -159,6 +223,7 @@ void TestAnyOfCudaStreams()
 DECLARE_UNITTEST(TestAnyOfCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function, typename Iterator2>
 __global__
 void none_of_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f, Iterator2 result)
@@ -176,23 +241,53 @@ void TestNoneOfDevice(ExecutionPolicy exec)
   thrust::device_vector<bool> result(1);
   
   none_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
   
   v[1] = 0;
   
   none_of_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
   
   none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 0, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 
   none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 1, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 
   none_of_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(false, result[0]);
 
   none_of_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 2, thrust::identity<T>(), result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(true, result[0]);
 }
 
@@ -209,12 +304,13 @@ void TestNoneOfDeviceDevice()
   TestNoneOfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestNoneOfDeviceDevice);
+#endif
 
 
 void TestNoneOfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector v(3, 1);
 
diff --git a/testing/cuda/logical.mk b/testing/cuda/logical.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/logical.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/managed_memory_pointer.mk b/testing/cuda/managed_memory_pointer.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/managed_memory_pointer.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/max_element.cu b/testing/cuda/max_element.cu
similarity index 53%
rename from testing/backend/cuda/max_element.cu
rename to testing/cuda/max_element.cu
index e80fd9fc6..defc314d1 100644
--- a/testing/backend/cuda/max_element.cu
+++ b/testing/cuda/max_element.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void max_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -33,12 +34,22 @@ void TestMaxElementDevice(ExecutionPolicy exec)
   typename thrust::host_vector<int>::iterator   h_max = thrust::max_element(h_data.begin(), h_data.end());
 
   max_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_max - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
 
   
   typename thrust::host_vector<int>::iterator   h_min = thrust::max_element(h_data.begin(), h_data.end(), thrust::greater<int>());
 
   max_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_min - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
 }
 
@@ -57,10 +68,19 @@ void TestMaxElementDeviceDevice()
 DECLARE_UNITTEST(TestMaxElementDeviceDevice);
 
 
-void TestMaxElementCudaStreams()
+void TestMaxElementDeviceNoSync()
+{
+  TestMaxElementDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestMaxElementDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestMaxElementCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(6);
   data[0] = 3;
@@ -73,13 +93,45 @@ void TestMaxElementCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()), 5);
-  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end()) - data.begin(), 1);
+  auto streampolicy = policy.on(s);
+
+  ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end()), 5);
+  ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end()) - data.begin(), 1);
   
-  ASSERT_EQUAL( *thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()), 1);
-  ASSERT_EQUAL( thrust::max_element(thrust::cuda::par.on(s), data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
+  ASSERT_EQUAL( *thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater<T>()), 1);
+  ASSERT_EQUAL( thrust::max_element(streampolicy, data.begin(), data.end(), thrust::greater<T>()) - data.begin(), 2);
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestMaxElementCudaStreams);
 
+void TestMaxElementCudaStreamsSync(){
+  TestMaxElementCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestMaxElementCudaStreamsSync);
+
+
+void TestMaxElementCudaStreamsNoSync(){
+  TestMaxElementCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestMaxElementCudaStreamsNoSync);
+
+
+void TestMaxElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::max_element(thrust::device, raw_ptr, raw_ptr+n) - raw_ptr, 1);
+  ASSERT_EQUAL( thrust::max_element(thrust::device, raw_ptr, raw_ptr+n, thrust::greater<T>()) - raw_ptr, 2);
+}
+DECLARE_UNITTEST(TestMaxElementDevicePointer);
diff --git a/testing/cuda/max_element.mk b/testing/cuda/max_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/max_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/memory.cu b/testing/cuda/memory.cu
similarity index 75%
rename from testing/backend/cuda/memory.cu
rename to testing/cuda/memory.cu
index 98fead8dc..eda432ca8 100644
--- a/testing/backend/cuda/memory.cu
+++ b/testing/cuda/memory.cu
@@ -26,7 +26,7 @@ void TestSelectSystemCudaToCpp()
 
   thrust::cuda::tag cuda_tag;
   thrust::cpp::tag cpp_tag;
-  thrust::system::cuda::detail::cross_system<thrust::cuda::tag,thrust::cpp::tag> cuda_to_cpp(cuda_tag, cpp_tag);
+  thrust::cuda_cub::cross_system<thrust::cuda::tag,thrust::cpp::tag> cuda_to_cpp(cuda_tag, cpp_tag);
 
   // select_system(cuda::tag, thrust::host_system_tag) should return cuda_to_cpp
   bool is_cuda_to_cpp = are_same_type(cuda_to_cpp, select_system(cuda_tag, cpp_tag));
@@ -35,6 +35,7 @@ void TestSelectSystemCudaToCpp()
 DECLARE_UNITTEST(TestSelectSystemCudaToCpp);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename Iterator>
 __global__ void get_temporary_buffer_kernel(size_t n, Iterator result)
 {
@@ -43,21 +44,25 @@ __global__ void get_temporary_buffer_kernel(size_t n, Iterator result)
 
 
 template<typename Pointer>
-__global__ void return_temporary_buffer_kernel(Pointer ptr)
+__global__ void return_temporary_buffer_kernel(Pointer ptr, std::ptrdiff_t n)
 {
-  thrust::return_temporary_buffer(thrust::seq, ptr);
+  thrust::return_temporary_buffer(thrust::seq, ptr, n);
 }
 
 
 void TestGetTemporaryBufferDeviceSeq()
 {
-  const size_t n = 9001;
+  const std::ptrdiff_t n = 9001;
 
   typedef thrust::pointer<int, thrust::detail::seq_t> pointer;
   typedef thrust::pair<pointer, std::ptrdiff_t> ptr_and_sz_type;
   thrust::device_vector<ptr_and_sz_type> d_result(1);
   
   get_temporary_buffer_kernel<<<1,1>>>(n, d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ptr_and_sz_type ptr_and_sz = d_result[0];
 
@@ -72,7 +77,11 @@ void TestGetTemporaryBufferDeviceSeq()
 
     ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-    return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first);
+    return_temporary_buffer_kernel<<<1,1>>>(ptr_and_sz.first, ptr_and_sz.second);
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
   }
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDeviceSeq);
@@ -94,12 +103,16 @@ __global__ void free_kernel(Pointer ptr)
 
 void TestMallocDeviceSeq()
 {
-  const size_t n = 9001;
+  const std::ptrdiff_t n = 9001;
 
   typedef thrust::pointer<int, thrust::detail::seq_t> pointer;
   thrust::device_vector<pointer> d_result(1);
   
   malloc_kernel<<<1,1>>>(n, d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   pointer ptr = d_result[0];
 
@@ -113,7 +126,12 @@ void TestMallocDeviceSeq()
     ASSERT_EQUAL(true, thrust::all_of(thrust::device, ptr, ptr + n, thrust::placeholders::_1 == ref_val));
 
     free_kernel<<<1,1>>>(ptr);
+    {
+      cudaError_t const err = cudaDeviceSynchronize();
+      ASSERT_EQUAL(cudaSuccess, err);
+    }
   }
 }
 DECLARE_UNITTEST(TestMallocDeviceSeq);
+#endif
 
diff --git a/testing/cuda/memory.mk b/testing/cuda/memory.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/memory.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/merge.cu b/testing/cuda/merge.cu
similarity index 94%
rename from testing/backend/cuda/merge.cu
rename to testing/cuda/merge.cu
index ce205ed79..1a96e8774 100644
--- a/testing/backend/cuda/merge.cu
+++ b/testing/cuda/merge.cu
@@ -6,6 +6,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void merge_kernel(ExecutionPolicy exec,
@@ -58,6 +59,9 @@ void TestMergeDevice(ExecutionPolicy exec)
                           d_b.begin(), d_b.begin() + size,
                           d_result.begin(),
                           d_end.begin());
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+
     d_result.resize((iter_type)d_end[0] - d_result.begin());
 
     ASSERT_EQUAL(h_result, d_result);
@@ -77,12 +81,13 @@ void TestMergeDeviceDevice()
   TestMergeDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMergeDeviceDevice);
+#endif
 
 
 void TestMergeCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(3), b(4);
 
diff --git a/testing/cuda/merge.mk b/testing/cuda/merge.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/merge_by_key.cu b/testing/cuda/merge_by_key.cu
similarity index 96%
rename from testing/backend/cuda/merge_by_key.cu
rename to testing/cuda/merge_by_key.cu
index 59079df79..40ea542df 100644
--- a/testing/backend/cuda/merge_by_key.cu
+++ b/testing/cuda/merge_by_key.cu
@@ -5,6 +5,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy,
          typename Iterator1,
          typename Iterator2,
@@ -60,6 +61,9 @@ void TestMergeByKeyDevice(ExecutionPolicy exec)
                                result_key.begin(),
                                result_val.begin(),
                                result_ends.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   thrust::pair<Iterator,Iterator> ends = result_ends[0];
 
   ASSERT_EQUAL_QUIET(result_key.end(), ends.first);
@@ -81,12 +85,13 @@ void TestMergeByKeyDeviceDevice()
   TestMergeByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMergeByKeyDeviceDevice);
+#endif
 
 
 void TestMergeByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a_key(3), a_val(3), b_key(4), b_val(4);
 
diff --git a/testing/cuda/merge_by_key.mk b/testing/cuda/merge_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/merge_sort.cu b/testing/cuda/merge_sort.cu
similarity index 76%
rename from testing/backend/cuda/merge_sort.cu
rename to testing/cuda/merge_sort.cu
index 99d51650f..7a4c2aa2e 100644
--- a/testing/backend/cuda/merge_sort.cu
+++ b/testing/cuda/merge_sort.cu
@@ -89,61 +89,74 @@ void InitializeSimpleStableKeySortTest(Vector& unsorted_keys, Vector& sorted_key
 
 void TestMergeSortKeySimple(void)
 {
+#if 0
     typedef thrust::device_vector<int> Vector;
-    typedef typename Vector::value_type T;
+    typedef Vector::value_type T;
 
     Vector unsorted_keys;
     Vector   sorted_keys;
 
     InitializeSimpleKeySortTest(unsorted_keys, sorted_keys);
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), thrust::less<T>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), thrust::less<T>());
 
     ASSERT_EQUAL(unsorted_keys, sorted_keys);
+#else
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_UNITTEST(TestMergeSortKeySimple);
 
 
 void TestMergeSortKeyValueSimple(void)
 {
+#if 0
     typedef thrust::device_vector<int> Vector;
-    typedef typename Vector::value_type T;
+    typedef Vector::value_type T;
 
     Vector unsorted_keys, unsorted_values;
     Vector   sorted_keys,   sorted_values;
 
     InitializeSimpleKeyValueSortTest(unsorted_keys, unsorted_values, sorted_keys, sorted_values);
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort_by_key(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin(), thrust::less<T>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), unsorted_values.begin(), thrust::less<T>());
 
     ASSERT_EQUAL(unsorted_keys,   sorted_keys);
     ASSERT_EQUAL(unsorted_values, sorted_values);
+#else
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_UNITTEST(TestMergeSortKeyValueSimple);
 
 
 void TestMergeSortStableKeySimple(void)
 {
+#if 0
     typedef thrust::device_vector<int> Vector;
-    typedef typename Vector::value_type T;
+    typedef Vector::value_type T;
 
     Vector unsorted_keys;
     Vector   sorted_keys;
 
     InitializeSimpleStableKeySortTest(unsorted_keys, sorted_keys);
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), less_div_10<T>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, unsorted_keys.begin(), unsorted_keys.end(), less_div_10<T>());
 
     ASSERT_EQUAL(unsorted_keys,   sorted_keys);
+#else
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_UNITTEST(TestMergeSortStableKeySimple);
 
 
 void TestMergeSortDescendingKey(void)
 {
+#if 0
     const size_t n = 10027;
 
     thrust::host_vector<int>   h_data = unittest::random_integers<int>(n);
@@ -151,10 +164,13 @@ void TestMergeSortDescendingKey(void)
 
     thrust::sort(h_data.begin(), h_data.end(), thrust::greater<int>());
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::greater<int>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::greater<int>());
 
     ASSERT_EQUAL(h_data, d_data);
+#else
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_UNITTEST(TestMergeSortDescendingKey);
 
@@ -162,6 +178,7 @@ DECLARE_UNITTEST(TestMergeSortDescendingKey);
 template <typename T>
 void TestMergeSortAscendingKeyValue(const size_t n)
 {
+#if 0
     thrust::host_vector<T>   h_keys = unittest::random_integers<T>(n);
     thrust::device_vector<T> d_keys = h_keys;
     
@@ -170,17 +187,22 @@ void TestMergeSortAscendingKeyValue(const size_t n)
 
     thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::less<T>());
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::less<T>());
 
     ASSERT_EQUAL(h_keys,   d_keys);
     ASSERT_EQUAL(h_values, d_values);
+#else
+    (void)n;
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_VARIABLE_UNITTEST(TestMergeSortAscendingKeyValue);
 
 
 void TestMergeSortDescendingKeyValue(void)
 {
+#if 0
     const size_t n = 10027;
 
     thrust::host_vector<int>   h_keys = unittest::random_integers<int>(n);
@@ -191,11 +213,14 @@ void TestMergeSortDescendingKeyValue(void)
 
     thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), thrust::greater<int>());
 
-    thrust::cuda::tag cuda_tag;
-    thrust::system::cuda::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::greater<int>());
+    thrust::cuda_bulk::tag cuda_tag;
+    thrust::system::cuda_bulk::detail::detail::stable_merge_sort_by_key(cuda_tag, d_keys.begin(), d_keys.end(), d_values.begin(), thrust::greater<int>());
 
     ASSERT_EQUAL(h_keys,   d_keys);
     ASSERT_EQUAL(h_values, d_values);
+#else
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_UNITTEST(TestMergeSortDescendingKeyValue);
 
@@ -203,6 +228,7 @@ DECLARE_UNITTEST(TestMergeSortDescendingKeyValue);
 template<typename U>
 void TestMergeSortKeyValue(size_t n)
 {
+#if 0
   typedef key_value<U,U> T;
 
   thrust::host_vector<U> h_keys   = unittest::random_integers<U>(n);
@@ -217,10 +243,14 @@ void TestMergeSortKeyValue(size_t n)
   thrust::device_vector<T> d_data = h_data;
 
   thrust::stable_sort(h_data.begin(), h_data.end());
-  thrust::cuda::tag cuda_tag;
-  thrust::system::cuda::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::less<T>());
+  thrust::cuda_bulk::tag cuda_tag;
+  thrust::system::cuda_bulk::detail::detail::stable_merge_sort(cuda_tag, d_data.begin(), d_data.end(), thrust::less<T>());
 
   ASSERT_EQUAL_QUIET(h_data, d_data);
+#else
+    (void) n;
+    KNOWN_FAILURE;
+#endif
 }
 DECLARE_VARIABLE_UNITTEST(TestMergeSortKeyValue);
 
diff --git a/testing/cuda/merge_sort.mk b/testing/cuda/merge_sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/merge_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/min_element.cu b/testing/cuda/min_element.cu
similarity index 75%
rename from testing/backend/cuda/min_element.cu
rename to testing/cuda/min_element.cu
index ab98302de..38dd96b11 100644
--- a/testing/backend/cuda/min_element.cu
+++ b/testing/cuda/min_element.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Iterator2>
 __global__
 void min_element_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Iterator2 result)
@@ -33,12 +34,21 @@ void TestMinElementDevice(ExecutionPolicy exec)
   typename thrust::host_vector<int>::iterator   h_min = thrust::min_element(h_data.begin(), h_data.end());
 
   min_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_min - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
 
-  
   typename thrust::host_vector<int>::iterator   h_max = thrust::min_element(h_data.begin(), h_data.end(), thrust::greater<int>());
 
   min_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(h_max - h_data.begin(), (iter_type)d_result[0] - d_data.begin());
 }
 
@@ -55,12 +65,13 @@ void TestMinElementDeviceDevice()
   TestMinElementDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMinElementDeviceDevice);
+#endif
 
 
 void TestMinElementCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(6);
   data[0] = 3;
@@ -83,3 +94,22 @@ void TestMinElementCudaStreams()
 }
 DECLARE_UNITTEST(TestMinElementCudaStreams);
 
+void TestMinElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::min_element(thrust::device, raw_ptr, raw_ptr+n) - raw_ptr, 2);
+  ASSERT_EQUAL( thrust::min_element(thrust::device, raw_ptr, raw_ptr+n, thrust::greater<T>()) - raw_ptr, 1);
+}
+DECLARE_UNITTEST(TestMinElementDevicePointer);
diff --git a/testing/cuda/min_element.mk b/testing/cuda/min_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/min_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/minmax_element.cu b/testing/cuda/minmax_element.cu
similarity index 81%
rename from testing/backend/cuda/minmax_element.cu
rename to testing/cuda/minmax_element.cu
index 99db1a2c1..6376bc28b 100644
--- a/testing/backend/cuda/minmax_element.cu
+++ b/testing/cuda/minmax_element.cu
@@ -2,6 +2,7 @@
 #include <thrust/extrema.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void minmax_element_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -45,6 +46,11 @@ void TestMinMaxElementDevice(ExecutionPolicy exec)
   d_max = thrust::minmax_element(d_data.begin(), d_data.end()).second;
 
   minmax_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   d_min = ((pair_type)d_result[0]).first;
   d_max = ((pair_type)d_result[0]).second;
   
@@ -55,6 +61,11 @@ void TestMinMaxElementDevice(ExecutionPolicy exec)
   h_min = thrust::minmax_element(h_data.begin(), h_data.end(), thrust::greater<int>()).second;
 
   minmax_element_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), thrust::greater<int>(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   d_max = ((pair_type)d_result[0]).first;
   d_min = ((pair_type)d_result[0]).second;
   
@@ -75,12 +86,12 @@ void TestMinMaxElementDeviceDevice()
   TestMinMaxElementDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMinMaxElementDeviceDevice);
+#endif
 
 
 void TestMinMaxElementCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector data(6);
   data[0] = 3;
@@ -102,3 +113,23 @@ void TestMinMaxElementCudaStreams()
 }
 DECLARE_UNITTEST(TestMinMaxElementCudaStreams);
 
+void TestMinMaxElementDevicePointer()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(6);
+  data[0] = 3;
+  data[1] = 5;
+  data[2] = 1;
+  data[3] = 2;
+  data[4] = 5;
+  data[5] = 1;
+
+  T* raw_ptr = thrust::raw_pointer_cast(data.data());
+  size_t n = data.size();
+  ASSERT_EQUAL( thrust::minmax_element(thrust::device, raw_ptr, raw_ptr+n).first - raw_ptr,  2);
+  ASSERT_EQUAL( thrust::minmax_element(thrust::device, raw_ptr, raw_ptr+n).second - raw_ptr, 1);
+}
+DECLARE_UNITTEST(TestMinMaxElementDevicePointer);
+
diff --git a/testing/cuda/minmax_element.mk b/testing/cuda/minmax_element.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/minmax_element.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/mismatch.cu b/testing/cuda/mismatch.cu
similarity index 89%
rename from testing/backend/cuda/mismatch.cu
rename to testing/cuda/mismatch.cu
index be53501c1..aac89352a 100644
--- a/testing/backend/cuda/mismatch.cu
+++ b/testing/cuda/mismatch.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__ void mismatch_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 result)
 {
@@ -28,6 +29,10 @@ void TestMismatchDevice(ExecutionPolicy exec)
   thrust::device_vector<pair_type> d_result(1);
   
   mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(2, ((pair_type)d_result[0]).first  - a.begin());
   ASSERT_EQUAL(2, ((pair_type)d_result[0]).second - b.begin());
@@ -35,12 +40,22 @@ void TestMismatchDevice(ExecutionPolicy exec)
   b[2] = 3;
   
   mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(3, ((pair_type)d_result[0]).first  - a.begin());
   ASSERT_EQUAL(3, ((pair_type)d_result[0]).second - b.begin());
   
   b[3] = 4;
   
   mismatch_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), d_result.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(4, ((pair_type)d_result[0]).first  - a.begin());
   ASSERT_EQUAL(4, ((pair_type)d_result[0]).second - b.begin());
 }
@@ -58,12 +73,12 @@ void TestMismatchDeviceDevice()
   TestMismatchDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestMismatchDeviceDevice);
+#endif
 
 
 void TestMismatchCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector a(4); Vector b(4);
   a[0] = 1; b[0] = 1;
diff --git a/testing/cuda/mismatch.mk b/testing/cuda/mismatch.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/mismatch.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/pair_sort.cu b/testing/cuda/pair_sort.cu
similarity index 72%
rename from testing/backend/cuda/pair_sort.cu
rename to testing/cuda/pair_sort.cu
index b6805de69..da23e4cb2 100644
--- a/testing/backend/cuda/pair_sort.cu
+++ b/testing/cuda/pair_sort.cu
@@ -4,16 +4,12 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator>
 __global__
-void stable_sort_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 is_supported)
+void stable_sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   thrust::stable_sort(exec, first, last);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -43,17 +39,14 @@ void TestPairStableSortDevice(ExecutionPolicy exec)
 
   thrust::device_vector<P> d_pairs = h_pairs;
 
-  thrust::device_vector<bool> is_supported(1);
-
-  stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), is_supported.begin());
+  stable_sort_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    // sort on the host
-    thrust::stable_sort(h_pairs.begin(), h_pairs.end());
+  // sort on the host
+  thrust::stable_sort(h_pairs.begin(), h_pairs.end());
 
-    ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
-  }
+  ASSERT_EQUAL_QUIET(h_pairs, d_pairs);
 };
 
 
@@ -69,4 +62,5 @@ void TestPairStableSortDeviceDevice()
   TestPairStableSortDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPairStableSortDeviceDevice);
+#endif
 
diff --git a/testing/cuda/pair_sort.mk b/testing/cuda/pair_sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/pair_sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/pair_sort_by_key.cu b/testing/cuda/pair_sort_by_key.cu
similarity index 77%
rename from testing/backend/cuda/pair_sort_by_key.cu
rename to testing/cuda/pair_sort_by_key.cu
index 7c8363428..fa229b8a6 100644
--- a/testing/backend/cuda/pair_sort_by_key.cu
+++ b/testing/cuda/pair_sort_by_key.cu
@@ -6,16 +6,12 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
-void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 is_supported)
+void stable_sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   thrust::stable_sort_by_key(exec, keys_first, keys_last, values_first);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -51,19 +47,16 @@ void TestPairStableSortByKeyDevice(ExecutionPolicy exec)
   thrust::device_vector<P>   d_pairs = h_pairs;
   thrust::device_vector<int> d_values = h_values;
 
-  thrust::device_vector<bool> is_supported(1);
-
   // sort on the device
-  stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin(), is_supported.begin());
+  stable_sort_by_key_kernel<<<1,1>>>(exec, d_pairs.begin(), d_pairs.end(), d_values.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    // sort on the host
-    thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
+  // sort on the host
+  thrust::stable_sort_by_key(h_pairs.begin(), h_pairs.end(), h_values.begin());
 
-    ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
-    ASSERT_EQUAL(h_values, d_values);
-  }
+  ASSERT_EQUAL_QUIET(h_pairs,  d_pairs);
+  ASSERT_EQUAL(h_values, d_values);
 };
 
 
@@ -79,4 +72,5 @@ void TestPairStableSortByKeyDeviceDevice()
   TestPairStableSortByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPairStableSortByKeyDeviceDevice);
+#endif
 
diff --git a/testing/cuda/pair_sort_by_key.mk b/testing/cuda/pair_sort_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/pair_sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/partition.cu b/testing/cuda/partition.cu
similarity index 80%
rename from testing/backend/cuda/partition.cu
rename to testing/cuda/partition.cu
index 7db39a798..f8701db6f 100644
--- a/testing/backend/cuda/partition.cu
+++ b/testing/cuda/partition.cu
@@ -4,14 +4,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
-__global__
-void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
-{
-  *result = thrust::partition(exec, first, last, pred);
-}
-
-
 template<typename T>
 struct is_even
 {
@@ -20,6 +12,15 @@ struct is_even
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
+__global__
+void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
+{
+  *result = thrust::partition(exec, first, last, pred);
+}
+
+
 template<typename ExecutionPolicy>
 void TestPartitionDevice(ExecutionPolicy exec)
 {
@@ -36,6 +37,8 @@ void TestPartitionDevice(ExecutionPolicy exec)
   thrust::device_vector<iterator> result(1);
   
   partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   thrust::device_vector<T> ref(5);
   ref[0] = 2;
@@ -63,6 +66,13 @@ void TestPartitionDeviceDevice()
 DECLARE_UNITTEST(TestPartitionDeviceDevice);
 
 
+void TestPartitionDeviceNoSync()
+{
+  TestPartitionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__
 void partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
@@ -94,6 +104,8 @@ void TestPartitionStencilDevice(ExecutionPolicy exec)
   thrust::device_vector<iterator> result(1);
   
   partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   thrust::device_vector<T> ref(5);
   ref[0] = 1;
@@ -121,6 +133,13 @@ void TestPartitionStencilDeviceDevice()
 DECLARE_UNITTEST(TestPartitionStencilDeviceDevice);
 
 
+void TestPartitionStencilDeviceNoSync()
+{
+  TestPartitionStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionStencilDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__
 void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
@@ -149,6 +168,8 @@ void TestPartitionCopyDevice(ExecutionPolicy exec)
   thrust::device_vector<pair_type> iterators(1);
   
   partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   thrust::device_vector<T> true_ref(2);
   true_ref[0] =  2;
@@ -182,6 +203,13 @@ void TestPartitionCopyDeviceDevice()
 DECLARE_UNITTEST(TestPartitionCopyDeviceDevice);
 
 
+void TestPartitionCopyDeviceNoSync()
+{
+  TestPartitionCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCopyDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
 __global__
 void partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
@@ -217,6 +245,8 @@ void TestPartitionCopyStencilDevice(ExecutionPolicy exec)
   thrust::device_vector<pair_type> iterators(1);
 
   partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   pair_type ends = iterators[0];
   
@@ -250,16 +280,18 @@ void TestPartitionCopyStencilDeviceDevice()
 DECLARE_UNITTEST(TestPartitionCopyStencilDeviceDevice);
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2, typename Iterator3>
+void TestPartitionCopyStencilDeviceNoSync()
+{
+  TestPartitionCopyStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCopyStencilDeviceNoSync);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
 __global__
-void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result, Iterator3 is_supported)
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   *result = thrust::stable_partition(exec, first, last, pred);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -277,22 +309,20 @@ void TestStablePartitionDevice(ExecutionPolicy exec)
   data[4] = 2; 
 
   thrust::device_vector<iterator> result(1);
-  thrust::device_vector<bool> is_supported(1);
-  
-  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin(), is_supported.begin());
-  
-  if(is_supported[0])
-  {
-    thrust::device_vector<T> ref(5);
-    ref[0] = 2;
-    ref[1] = 2;
-    ref[2] = 1;
-    ref[3] = 1;
-    ref[4] = 1;
+
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> ref(5);
+  ref[0] = 2;
+  ref[1] = 2;
+  ref[2] = 1;
+  ref[3] = 1;
+  ref[4] = 1;
     
-    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
-    ASSERT_EQUAL(ref, data);
-  }
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
 }
 
 
@@ -310,16 +340,18 @@ void TestStablePartitionDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionDeviceDevice);
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3, typename Iterator4>
+void TestStablePartitionDeviceNoSync()
+{
+  TestStablePartitionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionDeviceNoSync);
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Predicate, typename Iterator3>
 __global__
-void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result, Iterator4 is_supported)
+void stable_partition_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Predicate pred, Iterator3 result)
 {
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
   *result = thrust::stable_partition(exec, first, last, stencil_first, pred);
-#else
-  *is_supported = false;
-#endif
 }
 
 
@@ -344,22 +376,20 @@ void TestStablePartitionStencilDevice(ExecutionPolicy exec)
   stencil[4] = 2; 
 
   thrust::device_vector<iterator> result(1);
-  thrust::device_vector<bool> is_supported(1);
-  
-  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin(), is_supported.begin());
-  
-  if(is_supported[0])
-  {
-    thrust::device_vector<T> ref(5);
-    ref[0] = 1;
-    ref[1] = 1;
-    ref[2] = 0;
-    ref[3] = 0;
-    ref[4] = 0;
+
+  stable_partition_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), is_even<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  thrust::device_vector<T> ref(5);
+  ref[0] = 1;
+  ref[1] = 1;
+  ref[2] = 0;
+  ref[3] = 0;
+  ref[4] = 0;
     
-    ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
-    ASSERT_EQUAL(ref, data);
-  }
+  ASSERT_EQUAL(2, (iterator)result[0] - data.begin());
+  ASSERT_EQUAL(ref, data);
 }
 
 
@@ -377,6 +407,13 @@ void TestStablePartitionStencilDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionStencilDeviceDevice);
 
 
+void TestStablePartitionStencilDeviceNoSync()
+{
+  TestStablePartitionStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionStencilDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Predicate, typename Iterator4>
 __global__
 void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 true_result, Iterator3 false_result, Predicate pred, Iterator4 result)
@@ -405,6 +442,8 @@ void TestStablePartitionCopyDevice(ExecutionPolicy exec)
   thrust::device_vector<pair_type> iterators(1);
   
   stable_partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   thrust::device_vector<T> true_ref(2);
   true_ref[0] =  2;
@@ -438,6 +477,13 @@ void TestStablePartitionCopyDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionCopyDeviceDevice);
 
 
+void TestStablePartitionCopyDeviceNoSync()
+{
+  TestStablePartitionCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyDeviceNoSync);
+
+
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Predicate, typename Iterator5>
 __global__
 void stable_partition_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 stencil_first, Iterator3 true_result, Iterator4 false_result, Predicate pred, Iterator5 result)
@@ -473,6 +519,8 @@ void TestStablePartitionCopyStencilDevice(ExecutionPolicy exec)
   thrust::device_vector<pair_type> iterators(1);
 
   stable_partition_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), stencil.begin(), true_results.begin(), false_results.begin(), is_even<T>(), iterators.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   pair_type ends = iterators[0];
   
@@ -506,11 +554,20 @@ void TestStablePartitionCopyStencilDeviceDevice()
 DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceDevice);
 
 
-void TestPartitionCudaStreams()
+void TestStablePartitionCopyStencilDeviceNoSync()
+{
+  TestStablePartitionCopyStencilDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestStablePartitionCopyStencilDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestPartitionCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator   Iterator;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
   
   Vector data(5);
   data[0] = 1; 
@@ -521,8 +578,10 @@ void TestPartitionCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  Iterator iter = thrust::partition(thrust::cuda::par.on(s), data.begin(), data.end(), is_even<T>());
+  Iterator iter = thrust::partition(streampolicy, data.begin(), data.end(), is_even<T>());
   
   Vector ref(5);
   ref[0] = 2;
@@ -536,5 +595,17 @@ void TestPartitionCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestPartitionCudaStreams);
+
+void TestPartitionCudaStreamsSync()
+{
+  TestPartitionCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestPartitionCudaStreamsSync);
+
+
+void TestPartitionCudaStreamsNoSync()
+{
+  TestPartitionCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestPartitionCudaStreamsNoSync);
 
diff --git a/testing/cuda/partition.mk b/testing/cuda/partition.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/partition.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/partition_point.cu b/testing/cuda/partition_point.cu
similarity index 91%
rename from testing/backend/cuda/partition_point.cu
rename to testing/cuda/partition_point.cu
index 1bc915749..57e4344ee 100644
--- a/testing/backend/cuda/partition_point.cu
+++ b/testing/cuda/partition_point.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Predicate, typename Iterator2>
 __global__
 void partition_point_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Predicate pred, Iterator2 result)
@@ -31,6 +32,8 @@ void TestPartitionPointDevice(ExecutionPolicy exec)
 
   thrust::device_vector<iterator> result(1);
   partition_point_kernel<<<1,1>>>(exec, v.begin(), v.end(), is_even<int>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   ASSERT_EQUAL(ref - v.begin(), (iterator)result[0] - v.begin());
 }
@@ -48,13 +51,14 @@ void TestPartitionPointDeviceDevice()
   TestPartitionPointDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestPartitionPointDeviceDevice);
+#endif
 
 
 void TestPartitionPointCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::value_type T;
+  typedef Vector::iterator Iterator;
 
   Vector v(4);
   v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
diff --git a/testing/cuda/partition_point.mk b/testing/cuda/partition_point.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/partition_point.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/reduce.cu b/testing/cuda/reduce.cu
similarity index 50%
rename from testing/backend/cuda/reduce.cu
rename to testing/cuda/reduce.cu
index dd8462fba..865d31c22 100644
--- a/testing/backend/cuda/reduce.cu
+++ b/testing/cuda/reduce.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/reduce.h>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/constant_iterator.h>
 
 
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
@@ -11,6 +12,7 @@ void reduce_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T init,
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename T, typename ExecutionPolicy>
 void TestReduceDevice(ExecutionPolicy exec, const size_t n)
 {
@@ -24,6 +26,8 @@ void TestReduceDevice(ExecutionPolicy exec, const size_t n)
   T h_result = thrust::reduce(h_data.begin(), h_data.end(), init);
   
   reduce_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), init, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_result, d_result[0]);
 }
@@ -51,10 +55,22 @@ struct TestReduceDeviceDevice
 VariableUnitTest<TestReduceDeviceDevice, IntegralTypes> TestReduceDeviceDeviceInstance;
 
 
-void TestReduceCudaStreams()
+template<typename T>
+struct TestReduceDeviceNoSync
+{
+  void operator()(const size_t n)
+  {
+    TestReduceDevice<T>(thrust::cuda::par_nosync, n);
+  }
+};
+VariableUnitTest<TestReduceDeviceNoSync, IntegralTypes> TestReduceDeviceNoSyncInstance;
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestReduceCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector v(3);
   v[0] = 1; v[1] = -2; v[2] = 3;
@@ -62,13 +78,46 @@ void TestReduceCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
+  auto streampolicy = policy.on(s);
+
   // no initializer
-  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end()), 2);
+  ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end()), 2);
 
   // with initializer
-  ASSERT_EQUAL(thrust::reduce(thrust::cuda::par.on(s), v.begin(), v.end(), 10), 12);
+  ASSERT_EQUAL(thrust::reduce(streampolicy, v.begin(), v.end(), 10), 12);
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestReduceCudaStreams);
+
+void TestReduceCudaStreamsSync()
+{
+  TestReduceCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestReduceCudaStreamsSync);
+
+
+void TestReduceCudaStreamsNoSync()
+{
+  TestReduceCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceCudaStreamsNoSync);
+
+#if defined(THRUST_RDC_ENABLED)
+void TestReduceLargeInput()
+{
+  using T = unsigned long long;
+  using OffsetT = std::size_t;
+  const OffsetT num_items = 1ull << 32;
+
+  thrust::constant_iterator<T> d_data(T{1});
+  thrust::device_vector<T> d_result(1);
+
+  reduce_kernel<<<1,1>>>(thrust::device, d_data, d_data + num_items, T{}, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+  
+  ASSERT_EQUAL(num_items, d_result[0]);
+}
+DECLARE_UNITTEST(TestReduceLargeInput);
+#endif
 
diff --git a/testing/cuda/reduce.mk b/testing/cuda/reduce.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/reduce_by_key.cu b/testing/cuda/reduce_by_key.cu
similarity index 61%
rename from testing/backend/cuda/reduce_by_key.cu
rename to testing/cuda/reduce_by_key.cu
index dd65b56a2..20f44fb42 100644
--- a/testing/backend/cuda/reduce_by_key.cu
+++ b/testing/cuda/reduce_by_key.cu
@@ -1,8 +1,14 @@
-#include <unittest/unittest.h>
-#include <thrust/reduce.h>
+#include <thrust/equal.h>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/reduce.h>
+#include <unittest/unittest.h>
+
+#include <cstdint>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
 __global__
 void reduce_by_key_kernel(ExecutionPolicy exec,
@@ -43,6 +49,7 @@ void reduce_by_key_kernel(ExecutionPolicy exec,
 {
   *result = thrust::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, pred, binary_op);
 }
+#endif
 
 
 template<typename T>
@@ -85,6 +92,7 @@ void initialize_values(Vector& values)
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy>
 void TestReduceByKeyDevice(ExecutionPolicy exec)
 {
@@ -108,6 +116,11 @@ void TestReduceByKeyDevice(ExecutionPolicy exec)
   thrust::device_vector<T> output_values(values.size());
   
   reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
   
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
@@ -128,6 +141,11 @@ void TestReduceByKeyDevice(ExecutionPolicy exec)
   initialize_keys(keys);  initialize_values(values);
   
   reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
   
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
@@ -144,6 +162,11 @@ void TestReduceByKeyDevice(ExecutionPolicy exec)
   initialize_keys(keys);  initialize_values(values);
   
   reduce_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
   
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
@@ -176,15 +199,24 @@ void TestReduceByKeyDeviceDevice()
 DECLARE_UNITTEST(TestReduceByKeyDeviceDevice);
 
 
-void TestReduceByKeyCudaStreams()
+void TestReduceByKeyDeviceNoSync()
+{
+  TestReduceByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestReduceByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector keys;
   Vector values;
 
-  typename thrust::pair<typename Vector::iterator, typename Vector::iterator> new_last;
+  thrust::pair<Vector::iterator, Vector::iterator> new_last;
 
   // basic test
   initialize_keys(keys);  initialize_values(values);
@@ -195,7 +227,9 @@ void TestReduceByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+  auto streampolicy = policy.on(s);
+
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
@@ -214,7 +248,7 @@ void TestReduceByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_reduce<T>());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 3);
@@ -229,7 +263,7 @@ void TestReduceByKeyCudaStreams()
   // test BinaryFunction
   initialize_keys(keys);  initialize_values(values);
 
-  new_last = thrust::reduce_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
+  new_last = thrust::reduce_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), thrust::equal_to<T>(), thrust::plus<T>());
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
   ASSERT_EQUAL(new_last.second - output_values.begin(), 5);
@@ -247,5 +281,120 @@ void TestReduceByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestReduceByKeyCudaStreams);
 
+void TestReduceByKeyCudaStreamsSync()
+{
+  TestReduceByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestReduceByKeyCudaStreamsSync);
+
+
+void TestReduceByKeyCudaStreamsNoSync()
+{
+  TestReduceByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestReduceByKeyCudaStreamsNoSync);
+
+
+// Maps indices to key ids
+class div_op : public thrust::unary_function<std::int64_t, std::int64_t>
+{
+  std::int64_t m_divisor;
+
+public:
+  __host__ div_op(std::int64_t divisor)
+    : m_divisor(divisor)
+  {}
+
+  __host__ __device__
+  std::int64_t operator()(std::int64_t x) const
+  {
+    return x / m_divisor;
+  }
+};
+
+// Produces unique sequence for key
+class mod_op : public thrust::unary_function<std::int64_t, std::int64_t>
+{
+  std::int64_t m_divisor;
+
+public:
+  __host__ mod_op(std::int64_t divisor)
+    : m_divisor(divisor)
+  {}
+
+  __host__ __device__
+  std::int64_t operator()(std::int64_t x) const
+  {
+    // div: 2          
+    // idx: 0 1   2 3   4 5 
+    // key: 0 0 | 1 1 | 2 2 
+    // mod: 0 1 | 0 1 | 0 1
+    // ret: 0 1   1 2   2 3
+    return (x % m_divisor) + (x / m_divisor);
+  }
+};
+
+
+void TestReduceByKeyWithBigIndexesHelper(int magnitude)
+{
+  const std::int64_t key_size_magnitude = 8;
+  ASSERT_EQUAL(true, key_size_magnitude < magnitude);
+
+  const std::int64_t num_items       = 1ll << magnitude;
+  const std::int64_t num_unique_keys = 1ll << key_size_magnitude;
+
+  // Size of each key group
+  const std::int64_t key_size = num_items / num_unique_keys;
+
+  using counting_it      = thrust::counting_iterator<std::int64_t>;
+  using transform_key_it = thrust::transform_iterator<div_op, counting_it>;
+  using transform_val_it = thrust::transform_iterator<mod_op, counting_it>;
+
+  counting_it count_begin(0ll);
+  counting_it count_end = count_begin + num_items;
+  ASSERT_EQUAL(static_cast<std::int64_t>(thrust::distance(count_begin, count_end)),
+               num_items);
+
+  transform_key_it keys_begin(count_begin, div_op{key_size});
+  transform_key_it keys_end(count_end, div_op{key_size});
+
+  transform_val_it values_begin(count_begin, mod_op{key_size});
+
+  thrust::device_vector<std::int64_t> output_keys(num_unique_keys);
+  thrust::device_vector<std::int64_t> output_values(num_unique_keys);
+
+  // example:
+  //  items:        6
+  //  unique_keys:  2
+  //  key_size:     3
+  //  keys:         0 0 0 | 1 1 1 
+  //  values:       0 1 2 | 1 2 3
+  //  result:       3       6     = sum(range(key_size)) + key_size * key_id
+  thrust::reduce_by_key(keys_begin,
+                        keys_end,
+                        values_begin,
+                        output_keys.begin(),
+                        output_values.begin());
+
+  ASSERT_EQUAL(
+    true,
+    thrust::equal(output_keys.begin(), output_keys.end(), count_begin));
+
+  thrust::host_vector<std::int64_t> result = output_values;
+
+  const std::int64_t sum = (key_size - 1) * key_size / 2;
+  for (std::int64_t key_id = 0; key_id < num_unique_keys; key_id++)
+  {
+    ASSERT_EQUAL(result[key_id], sum + key_id * key_size);
+  }
+}
+
+void TestReduceByKeyWithBigIndexes()
+{
+  TestReduceByKeyWithBigIndexesHelper(30);
+  TestReduceByKeyWithBigIndexesHelper(31);
+  TestReduceByKeyWithBigIndexesHelper(32);
+  TestReduceByKeyWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestReduceByKeyWithBigIndexes);
diff --git a/testing/cuda/reduce_by_key.mk b/testing/cuda/reduce_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reduce_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/remove.cu b/testing/cuda/remove.cu
similarity index 82%
rename from testing/backend/cuda/remove.cu
rename to testing/cuda/remove.cu
index 9f12be568..0331c24b8 100644
--- a/testing/backend/cuda/remove.cu
+++ b/testing/cuda/remove.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T, typename Iterator2>
 __global__
 void remove_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val, Iterator2 result)
@@ -49,6 +50,7 @@ void remove_copy_if_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last
 {
   *result_end = thrust::remove_copy_if(exec, first, last, stencil_first, result, pred);
 }
+#endif
 
 
 template<typename T>
@@ -69,6 +71,7 @@ struct is_true
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy>
 void TestRemoveDevice(ExecutionPolicy exec)
 {
@@ -80,7 +83,11 @@ void TestRemoveDevice(ExecutionPolicy exec)
   thrust::device_vector<iterator> d_result(1);
   
   size_t h_size = thrust::remove(h_data.begin(), h_data.end(), 0) - h_data.begin();
+
   remove_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), 0, d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_result[0] - d_data.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -117,7 +124,11 @@ void TestRemoveIfDevice(ExecutionPolicy exec)
   thrust::device_vector<iterator> d_result(1);
   
   size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true<int>()) - h_data.begin();
+
   remove_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), is_true<int>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_result[0] - d_data.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -159,6 +170,9 @@ void TestRemoveIfStencilDevice(ExecutionPolicy exec)
   size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true<int>()) - h_data.begin();
 
   remove_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), is_true<int>(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_result[0] - d_data.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -200,6 +214,9 @@ void TestRemoveCopyDevice(ExecutionPolicy exec)
   size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), 0) - h_result.begin();
 
   remove_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), 0, d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_new_end[0] - d_result.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -241,6 +258,9 @@ void TestRemoveCopyIfDevice(ExecutionPolicy exec)
   size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<int>()) - h_result.begin();
 
   remove_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin(), is_true<int>(), d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_new_end[0] - d_result.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -285,6 +305,9 @@ void TestRemoveCopyIfStencilDevice(ExecutionPolicy exec)
   size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true<int>()) - h_result.begin();
 
   remove_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true<int>(), d_new_end.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   size_t d_size = (iterator)d_new_end[0] - d_result.begin();
   
   ASSERT_EQUAL(h_size, d_size);
@@ -308,12 +331,13 @@ void TestRemoveCopyIfStencilDeviceDevice()
   TestRemoveCopyIfStencilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestRemoveCopyIfStencilDeviceDevice);
+#endif
 
 
 void TestRemoveCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -325,10 +349,10 @@ void TestRemoveCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove(thrust::cuda::par.on(s),
-                                                 data.begin(), 
-                                                 data.end(), 
-                                                 (T) 2);
+  Vector::iterator end = thrust::remove(thrust::cuda::par.on(s),
+                                        data.begin(), 
+                                        data.end(), 
+                                        (T) 2);
 
   ASSERT_EQUAL(end - data.begin(), 3);
 
@@ -344,7 +368,7 @@ DECLARE_UNITTEST(TestRemoveCudaStreams);
 void TestRemoveCopyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -358,11 +382,11 @@ void TestRemoveCopyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove_copy(thrust::cuda::par.on(s),
-                                                      data.begin(), 
-                                                      data.end(), 
-                                                      result.begin(), 
-                                                      (T) 2);
+  Vector::iterator end = thrust::remove_copy(thrust::cuda::par.on(s),
+                                             data.begin(), 
+                                             data.end(), 
+                                             result.begin(), 
+                                             (T) 2);
 
   ASSERT_EQUAL(end - result.begin(), 3);
 
@@ -378,7 +402,7 @@ DECLARE_UNITTEST(TestRemoveCopyCudaStreams);
 void TestRemoveIfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -390,10 +414,10 @@ void TestRemoveIfCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
-                                                    data.begin(), 
-                                                    data.end(), 
-                                                    is_even<T>());
+  Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
+                                           data.begin(), 
+                                           data.end(), 
+                                           is_even<T>());
 
   ASSERT_EQUAL(end - data.begin(), 3);
 
@@ -409,7 +433,7 @@ DECLARE_UNITTEST(TestRemoveIfCudaStreams);
 void TestRemoveIfStencilCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -428,11 +452,11 @@ void TestRemoveIfStencilCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
-                                                    data.begin(), 
-                                                    data.end(),
-                                                    stencil.begin(),
-                                                    thrust::identity<T>());
+  Vector::iterator end = thrust::remove_if(thrust::cuda::par.on(s),
+                                           data.begin(), 
+                                           data.end(),
+                                           stencil.begin(),
+                                           thrust::identity<T>());
 
   ASSERT_EQUAL(end - data.begin(), 3);
 
@@ -448,7 +472,7 @@ DECLARE_UNITTEST(TestRemoveIfStencilCudaStreams);
 void TestRemoveCopyIfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -462,11 +486,11 @@ void TestRemoveCopyIfCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
-                                                         data.begin(), 
-                                                         data.end(), 
-                                                         result.begin(), 
-                                                         is_even<T>());
+  Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
+                                                data.begin(), 
+                                                data.end(), 
+                                                result.begin(), 
+                                                is_even<T>());
 
   ASSERT_EQUAL(end - result.begin(), 3);
 
@@ -482,7 +506,7 @@ DECLARE_UNITTEST(TestRemoveCopyIfCudaStreams);
 void TestRemoveCopyIfStencilCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
@@ -503,12 +527,12 @@ void TestRemoveCopyIfStencilCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  typename Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
-                                                         data.begin(), 
-                                                         data.end(), 
-                                                         stencil.begin(),
-                                                         result.begin(), 
-                                                         thrust::identity<T>());
+  Vector::iterator end = thrust::remove_copy_if(thrust::cuda::par.on(s),
+                                                data.begin(), 
+                                                data.end(), 
+                                                stencil.begin(),
+                                                result.begin(), 
+                                                thrust::identity<T>());
 
   ASSERT_EQUAL(end - result.begin(), 3);
 
diff --git a/testing/cuda/remove.mk b/testing/cuda/remove.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/remove.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/replace.cu b/testing/cuda/replace.cu
similarity index 93%
rename from testing/backend/cuda/replace.cu
rename to testing/cuda/replace.cu
index beb622c6b..bb8b7faa9 100644
--- a/testing/backend/cuda/replace.cu
+++ b/testing/cuda/replace.cu
@@ -10,6 +10,7 @@ struct less_than_five
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T1, typename T2>
 __global__
 void replace_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T1 old_value, T2 new_value)
@@ -28,7 +29,10 @@ void TestReplaceDevice(ExecutionPolicy exec, const size_t n)
   T new_value = 1;
   
   thrust::replace(h_data.begin(), h_data.end(), old_value, new_value);
+
   replace_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), old_value, new_value);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
 }
@@ -71,7 +75,10 @@ void TestReplaceCopyDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_dest(n);
   
   thrust::replace_copy(h_data.begin(), h_data.end(), h_dest.begin(), old_value, new_value);
+
   replace_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_dest.begin(), old_value, new_value);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
   ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -106,7 +113,10 @@ void TestReplaceIfDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_data = h_data;
   
   thrust::replace_if(h_data.begin(), h_data.end(), less_than_five<int>(), 0);
+
   replace_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
 }
@@ -143,7 +153,10 @@ void TestReplaceIfStencilDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_stencil = h_stencil;
   
   thrust::replace_if(h_data.begin(), h_data.end(), h_stencil.begin(), less_than_five<int>(), 0);
+
   replace_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
 }
@@ -180,7 +193,10 @@ void TestReplaceCopyIfDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_dest(n);
   
   thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<int>(), 0);
+
   replace_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
   ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -221,7 +237,10 @@ void TestReplaceCopyIfStencilDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_dest(n);
   
   thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<int>(), 0);
+
   replace_copy_if_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<int>(), 0);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_ALMOST_EQUAL(h_data, d_data);
   ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -240,12 +259,13 @@ void TestReplaceCopyIfStencilDeviceDevice()
   TestReplaceCopyIfStencilDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestReplaceCopyIfStencilDeviceDevice);
+#endif
 
 
 void TestReplaceCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
   Vector data(5);
   data[0] =  1; 
diff --git a/testing/cuda/replace.mk b/testing/cuda/replace.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/replace.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/reverse.cu b/testing/cuda/reverse.cu
similarity index 93%
rename from testing/backend/cuda/reverse.cu
rename to testing/cuda/reverse.cu
index 4344263fb..f6599ed61 100644
--- a/testing/backend/cuda/reverse.cu
+++ b/testing/cuda/reverse.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator>
 __global__
 void reverse_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
@@ -19,7 +20,10 @@ void TestReverseDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_data = h_data;
   
   thrust::reverse(h_data.begin(), h_data.end());
+
   reverse_kernel<<<1,1>>>(exec, raw_pointer_cast(d_data.data()), raw_pointer_cast(d_data.data() + d_data.size()));
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_data, d_data);
 };
@@ -58,7 +62,10 @@ void TestReverseCopyDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_result(n);
 
   thrust::reverse_copy(h_data.begin(), h_data.end(), h_result.begin());
+
   reverse_copy_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), d_result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   ASSERT_EQUAL(h_result, d_result);
 };
@@ -76,6 +83,7 @@ void TestReverseCopyDeviceDevice()
   TestReverseCopyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestReverseCopyDeviceDevice);
+#endif
 
 
 void TestReverseCudaStreams()
diff --git a/testing/cuda/reverse.mk b/testing/cuda/reverse.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/reverse.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/scan.cu b/testing/cuda/scan.cu
similarity index 72%
rename from testing/backend/cuda/scan.cu
rename to testing/cuda/scan.cu
index 4bcde6e87..5a19798cd 100644
--- a/testing/backend/cuda/scan.cu
+++ b/testing/cuda/scan.cu
@@ -4,6 +4,7 @@
 #include <thrust/functional.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -38,29 +39,59 @@ void TestScanDevice(ExecutionPolicy exec, const size_t n)
   thrust::device_vector<T> d_output(n);
   
   thrust::inclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+
   inclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_output, h_output);
   
   thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin());
+
   exclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_output, h_output);
   
   thrust::exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), (T) 11);
+
   exclusive_scan_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_output.begin(), (T) 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_output, h_output);
   
   // in-place scans
   h_output = h_input;
   d_output = d_input;
+
   thrust::inclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+
   inclusive_scan_kernel<<<1,1>>>(exec, d_output.begin(), d_output.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ASSERT_EQUAL(d_output, h_output);
   
   h_output = h_input;
   d_output = d_input;
   
   thrust::exclusive_scan(h_output.begin(), h_output.end(), h_output.begin());
+
   exclusive_scan_kernel<<<1,1>>>(exec, d_output.begin(), d_output.end(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(d_output, h_output);
 }
@@ -86,14 +117,15 @@ struct TestScanDeviceDevice
   }
 };
 VariableUnitTest<TestScanDeviceDevice, IntegralTypes> TestScanDeviceDeviceInstance;
+#endif
 
 
 void TestScanCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
   
-  typename Vector::iterator iter;
+  Vector::iterator iter;
 
   Vector input(5);
   Vector result(5);
@@ -111,7 +143,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -120,7 +152,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -129,7 +161,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -138,7 +170,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
 
@@ -147,7 +179,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
 
@@ -157,7 +189,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(input, result);
 
   // inplace exclusive scan with init
@@ -166,7 +198,7 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(input, result);
 
   // inplace exclusive scan with implicit init=0
@@ -175,10 +207,55 @@ void TestScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(input, result);
 
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestScanCudaStreams);
 
+template <typename T>
+struct const_ref_plus_mod3
+{
+    T * table;
+
+    const_ref_plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    const T& operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+static void TestInclusiveScanWithConstAccumulator(void)
+{
+    // add numbers modulo 3 with external lookup table
+    thrust::device_vector<int> data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    thrust::device_vector<int> table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3<int>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], 0);
+    ASSERT_EQUAL(data[1], 1);
+    ASSERT_EQUAL(data[2], 0);
+    ASSERT_EQUAL(data[3], 1);
+    ASSERT_EQUAL(data[4], 0);
+    ASSERT_EQUAL(data[5], 0);
+    ASSERT_EQUAL(data[6], 1);
+}
+DECLARE_UNITTEST(TestInclusiveScanWithConstAccumulator);
diff --git a/testing/cuda/scan.mk b/testing/cuda/scan.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/scan_by_key.cu b/testing/cuda/scan_by_key.cu
similarity index 82%
rename from testing/backend/cuda/scan_by_key.cu
rename to testing/cuda/scan_by_key.cu
index cc6e36ce4..0fea161d7 100644
--- a/testing/backend/cuda/scan_by_key.cu
+++ b/testing/cuda/scan_by_key.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void inclusive_scan_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
@@ -36,7 +37,7 @@ void TestScanByKeyDevice(ExecutionPolicy exec)
   thrust::host_vector<int> h_keys(n);
   for(size_t i = 0, k = 0; i < n; i++)
   {
-    h_keys[i] = k;
+    h_keys[i] = static_cast<int>(k);
     if(rand() % 10 == 0)
     {
       k++;
@@ -56,28 +57,66 @@ void TestScanByKeyDevice(ExecutionPolicy exec)
   
   thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
   inclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   ASSERT_EQUAL(d_output, h_output);
   
   thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
   exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   ASSERT_EQUAL(d_output, h_output);
   
   thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), 11);
   exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   ASSERT_EQUAL(d_output, h_output);
   
-  // in-place scans
+  // in-place scans: in/out values aliasing
   h_output = h_vals;
   d_output = d_vals;
   thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
   inclusive_scan_by_key_kernel<<<1,1>>>(exec,d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   ASSERT_EQUAL(d_output, h_output);
   
   h_output = h_vals;
   d_output = d_vals;
   thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), 11);
   exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: keys/values aliasing
+  thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
+  inclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_keys, h_output);
+
+  d_keys = h_keys;
+  thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), 11);
+  exclusive_scan_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_vals.begin(), d_keys.begin(), 11);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+  ASSERT_EQUAL(d_keys, h_output);
 }
 
 
@@ -93,13 +132,14 @@ void TestScanByKeyDeviceDevice()
   TestScanByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestScanByKeyDeviceDevice);
+#endif
 
 
 void TestInclusiveScanByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator   Iterator;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
 
   Vector keys(7);
   Vector vals(7);
@@ -160,8 +200,8 @@ DECLARE_UNITTEST(TestInclusiveScanByKeyCudaStreams);
 void TestExclusiveScanByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator   Iterator;
+  typedef Vector::value_type T;
+  typedef Vector::iterator   Iterator;
 
   Vector keys(7);
   Vector vals(7);
diff --git a/testing/cuda/scan_by_key.mk b/testing/cuda/scan_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scan_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/scatter.cu b/testing/cuda/scatter.cu
similarity index 95%
rename from testing/backend/cuda/scatter.cu
rename to testing/cuda/scatter.cu
index 802af1257..92e7f342a 100644
--- a/testing/backend/cuda/scatter.cu
+++ b/testing/cuda/scatter.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 #include <algorithm>
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void scatter_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 map_first, Iterator3 result)
@@ -33,7 +34,10 @@ void TestScatterDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_output(output_size, 0);
   
   thrust::scatter(h_input.begin(), h_input.end(), h_map.begin(), h_output.begin());
+
   scatter_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_map.begin(), d_output.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_output, d_output);
 }
@@ -88,7 +92,10 @@ void TestScatterIfDevice(ExecutionPolicy exec)
   thrust::device_vector<int> d_output(output_size, 0);
   
   thrust::scatter_if(h_input.begin(), h_input.end(), h_map.begin(), h_map.begin(), h_output.begin(), is_even_scatter_if<unsigned int>());
+
   scatter_if_kernel<<<1,1>>>(exec, d_input.begin(), d_input.end(), d_map.begin(), d_map.begin(), d_output.begin(), is_even_scatter_if<unsigned int>());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(h_output, d_output);
 }
@@ -106,12 +113,12 @@ void TestScatterIfDeviceDevice()
   TestScatterIfDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestScatterIfDeviceDevice);
+#endif
 
 
 void TestScatterCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector map(5);  // scatter indices
   Vector src(5);  // source vector
@@ -145,7 +152,6 @@ DECLARE_UNITTEST(TestScatterCudaStreams);
 void TestScatterIfCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector flg(5);  // predicate array
   Vector map(5);  // scatter indices
diff --git a/testing/cuda/scatter.mk b/testing/cuda/scatter.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/scatter.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/sequence.cu b/testing/cuda/sequence.cu
similarity index 88%
rename from testing/backend/cuda/sequence.cu
rename to testing/cuda/sequence.cu
index a69dc2b63..16b2d799b 100644
--- a/testing/backend/cuda/sequence.cu
+++ b/testing/cuda/sequence.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator>
 __global__
 void sequence_kernel(ExecutionPolicy exec, Iterator first, Iterator last)
@@ -33,7 +34,11 @@ void TestSequenceDevice(ExecutionPolicy exec)
   thrust::device_vector<int> v(5);
   
   sequence_kernel<<<1,1>>>(exec, v.begin(), v.end());
-  
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+ 
   ASSERT_EQUAL(v[0], 0);
   ASSERT_EQUAL(v[1], 1);
   ASSERT_EQUAL(v[2], 2);
@@ -41,6 +46,10 @@ void TestSequenceDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(v[4], 4);
   
   sequence_kernel<<<1,1>>>(exec, v.begin(), v.end(), 10);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], 10);
   ASSERT_EQUAL(v[1], 11);
@@ -49,6 +58,10 @@ void TestSequenceDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(v[4], 14);
   
   sequence_kernel<<<1,1>>>(exec, v.begin(), v.end(), 10, 2);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], 10);
   ASSERT_EQUAL(v[1], 12);
@@ -68,11 +81,11 @@ void TestSequenceDeviceDevice()
   TestSequenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSequenceDeviceDevice);
+#endif
 
 void TestSequenceCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector v(5);
 
diff --git a/testing/cuda/sequence.mk b/testing/cuda/sequence.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sequence.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_difference.cu b/testing/cuda/set_difference.cu
similarity index 92%
rename from testing/backend/cuda/set_difference.cu
rename to testing/cuda/set_difference.cu
index 4849edd5c..bd9da131f 100644
--- a/testing/backend/cuda/set_difference.cu
+++ b/testing/cuda/set_difference.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_difference_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator2 last2, Iterator3 result1, Iterator4 result2)
@@ -30,6 +31,8 @@ void TestSetDifferenceDevice(ExecutionPolicy exec)
   thrust::device_vector<Iterator> end_vec(1);
 
   set_difference_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), b.end(), result.begin(), end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   Iterator end = end_vec.front();
 
@@ -50,12 +53,13 @@ void TestSetDifferenceDeviceDevice()
   TestSetDifferenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetDifferenceDeviceDevice);
+#endif
 
 
 void TestSetDifferenceCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(4), b(5);
 
diff --git a/testing/cuda/set_difference.mk b/testing/cuda/set_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_difference_by_key.cu b/testing/cuda/set_difference_by_key.cu
similarity index 96%
rename from testing/backend/cuda/set_difference_by_key.cu
rename to testing/cuda/set_difference_by_key.cu
index 6c250e654..2c32466f1 100644
--- a/testing/backend/cuda/set_difference_by_key.cu
+++ b/testing/cuda/set_difference_by_key.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_difference_by_key_kernel(ExecutionPolicy exec,
@@ -58,6 +59,8 @@ void TestSetDifferenceByKeyDevice(ExecutionPolicy exec)
                                         result_key.begin(),
                                         result_val.begin(),
                                         end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   iter_pair end = end_vec.front();
 
@@ -80,12 +83,13 @@ void TestSetDifferenceByKeyDeviceDevice()
   TestSetDifferenceByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetDifferenceByKeyDeviceDevice);
+#endif
 
 
 void TestSetDifferenceByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a_key(4), b_key(5);
   Vector a_val(4), b_val(5);
diff --git a/testing/cuda/set_difference_by_key.mk b/testing/cuda/set_difference_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_intersection.cu b/testing/cuda/set_intersection.cu
similarity index 70%
rename from testing/backend/cuda/set_intersection.cu
rename to testing/cuda/set_intersection.cu
index 948142887..2bb30ea87 100644
--- a/testing/backend/cuda/set_intersection.cu
+++ b/testing/cuda/set_intersection.cu
@@ -6,6 +6,7 @@
 #include <thrust/iterator/discard_iterator.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_intersection_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1,
@@ -21,7 +22,7 @@ template<typename ExecutionPolicy>
 void TestSetIntersectionDevice(ExecutionPolicy exec)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(3), b(4);
 
@@ -35,6 +36,9 @@ void TestSetIntersectionDevice(ExecutionPolicy exec)
   thrust::device_vector<Iterator> end_vec(1);
 
   set_intersection_kernel<<<1,1>>>(exec, a.begin(), a.end(), b.begin(), b.end(), result.begin(), end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   Iterator end = end_vec.front();
 
   ASSERT_EQUAL_QUIET(result.end(), end);
@@ -56,10 +60,19 @@ void TestSetIntersectionDeviceDevice()
 DECLARE_UNITTEST(TestSetIntersectionDeviceDevice);
 
 
-void TestSetIntersectionCudaStreams()
+void TestSetIntersectionDeviceNoSync()
+{
+  TestSetIntersectionDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(3), b(4);
 
@@ -74,7 +87,9 @@ void TestSetIntersectionCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  Iterator end = thrust::set_intersection(thrust::cuda::par.on(s),
+  auto streampolicy = policy.on(s);
+
+  Iterator end = thrust::set_intersection(streampolicy,
                                           a.begin(), a.end(),
                                           b.begin(), b.end(),
                                           result.begin());
@@ -85,5 +100,17 @@ void TestSetIntersectionCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestSetIntersectionCudaStreams);
+
+void TestSetIntersectionCudaStreamsSync()
+{
+  TestSetIntersectionCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestSetIntersectionCudaStreamsSync);
+
+
+void TestSetIntersectionCudaStreamsNoSync()
+{
+  TestSetIntersectionCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionCudaStreamsNoSync);
 
diff --git a/testing/cuda/set_intersection.mk b/testing/cuda/set_intersection.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_intersection.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_intersection_by_key.cu b/testing/cuda/set_intersection_by_key.cu
similarity index 80%
rename from testing/backend/cuda/set_intersection_by_key.cu
rename to testing/cuda/set_intersection_by_key.cu
index f6f0c979a..fed6cb6f6 100644
--- a/testing/backend/cuda/set_intersection_by_key.cu
+++ b/testing/cuda/set_intersection_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6>
 __global__
 void set_intersection_by_key_kernel(ExecutionPolicy exec,
@@ -47,6 +48,8 @@ void TestSetIntersectionByKeyDevice(ExecutionPolicy exec)
                                           result_key.begin(),
                                           result_val.begin(),
                                           end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   thrust::pair<Iterator,Iterator> end = end_vec.front();
 
@@ -71,10 +74,19 @@ void TestSetIntersectionByKeyDeviceDevice()
 DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceDevice);
 
 
-void TestSetIntersectionByKeyCudaStreams()
+void TestSetIntersectionByKeyDeviceNoSync()
+{
+  TestSetIntersectionByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestSetIntersectionByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a_key(3), b_key(4);
   Vector a_val(3);
@@ -93,8 +105,10 @@ void TestSetIntersectionByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
+  auto streampolicy = policy.on(s);
+
   thrust::pair<Iterator,Iterator> end =
-    thrust::set_intersection_by_key(thrust::cuda::par.on(s),
+    thrust::set_intersection_by_key(streampolicy,
                                     a_key.begin(), a_key.end(),
                                     b_key.begin(), b_key.end(),
                                     a_val.begin(),
@@ -109,5 +123,17 @@ void TestSetIntersectionByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreams);
+
+void TestSetIntersectionByKeyCudaStreamsSync()
+{
+  TestSetIntersectionByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsSync);
+
+
+void TestSetIntersectionByKeyCudaStreamsNoSync()
+{
+  TestSetIntersectionByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestSetIntersectionByKeyCudaStreamsNoSync);
 
diff --git a/testing/cuda/set_intersection_by_key.mk b/testing/cuda/set_intersection_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_intersection_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_symmetric_difference.cu b/testing/cuda/set_symmetric_difference.cu
similarity index 94%
rename from testing/backend/cuda/set_symmetric_difference.cu
rename to testing/cuda/set_symmetric_difference.cu
index 48ec9a5f4..43fc0e993 100644
--- a/testing/backend/cuda/set_symmetric_difference.cu
+++ b/testing/cuda/set_symmetric_difference.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_symmetric_difference_kernel(ExecutionPolicy exec,
@@ -37,6 +38,9 @@ void TestSetSymmetricDifferenceDevice(ExecutionPolicy exec)
                                            b.begin(), b.end(),
                                            result.begin(),
                                            end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   Iterator end = end_vec[0];
 
   ASSERT_EQUAL_QUIET(result.end(), end);
@@ -56,12 +60,13 @@ void TestSetSymmetricDifferenceDeviceDevice()
   TestSetSymmetricDifferenceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetSymmetricDifferenceDeviceDevice);
+#endif
 
 
 void TestSetSymmetricDifferenceCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(4), b(5);
 
diff --git a/testing/cuda/set_symmetric_difference.mk b/testing/cuda/set_symmetric_difference.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_symmetric_difference.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_symmetric_difference_by_key.cu b/testing/cuda/set_symmetric_difference_by_key.cu
similarity index 96%
rename from testing/backend/cuda/set_symmetric_difference_by_key.cu
rename to testing/cuda/set_symmetric_difference_by_key.cu
index 0b8677bdd..7e7adba5e 100644
--- a/testing/backend/cuda/set_symmetric_difference_by_key.cu
+++ b/testing/cuda/set_symmetric_difference_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_symmetric_difference_by_key_kernel(ExecutionPolicy exec,
@@ -50,6 +51,9 @@ void TestSetSymmetricDifferenceByKeyDevice(ExecutionPolicy exec)
                                                   result_key.begin(),
                                                   result_val.begin(),
                                                   end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   iter_pair end = end_vec[0];
 
   ASSERT_EQUAL_QUIET(result_key.end(), end.first);
@@ -71,12 +75,13 @@ void TestSetSymmetricDifferenceByKeyDeviceDevice()
   TestSetSymmetricDifferenceByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetSymmetricDifferenceByKeyDeviceDevice);
+#endif
 
 
 void TestSetSymmetricDifferenceByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a_key(4), b_key(5);
   Vector a_val(4), b_val(5);
diff --git a/testing/cuda/set_symmetric_difference_by_key.mk b/testing/cuda/set_symmetric_difference_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_symmetric_difference_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_union.cu b/testing/cuda/set_union.cu
similarity index 93%
rename from testing/backend/cuda/set_union.cu
rename to testing/cuda/set_union.cu
index a7975bdf4..058f0e700 100644
--- a/testing/backend/cuda/set_union.cu
+++ b/testing/cuda/set_union.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4>
 __global__
 void set_union_kernel(ExecutionPolicy exec,
@@ -37,6 +38,9 @@ void TestSetUnionDevice(ExecutionPolicy exec)
                             b.begin(), b.end(),
                             result.begin(),
                             end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   Iterator end = end_vec[0];
 
   ASSERT_EQUAL_QUIET(result.end(), end);
@@ -56,12 +60,13 @@ void TestSetUnionDeviceDevice()
   TestSetUnionDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetUnionDeviceDevice);
+#endif
 
 
 void TestSetUnionCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a(3), b(4);
 
diff --git a/testing/cuda/set_union.mk b/testing/cuda/set_union.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_union.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/set_union_by_key.cu b/testing/cuda/set_union_by_key.cu
similarity index 96%
rename from testing/backend/cuda/set_union_by_key.cu
rename to testing/cuda/set_union_by_key.cu
index 0f26397ad..013ebe11b 100644
--- a/testing/backend/cuda/set_union_by_key.cu
+++ b/testing/cuda/set_union_by_key.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5, typename Iterator6, typename Iterator7>
 __global__
 void set_union_by_key_kernel(ExecutionPolicy exec,
@@ -49,6 +50,9 @@ void TestSetUnionByKeyDevice(ExecutionPolicy exec)
                                    result_key.begin(),
                                    result_val.begin(),
                                    end_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   thrust::pair<Iterator,Iterator> end = end_vec[0];
 
   ASSERT_EQUAL_QUIET(result_key.end(), end.first);
@@ -70,12 +74,13 @@ void TestSetUnionByKeyDeviceDevice()
   TestSetUnionByKeyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSetUnionByKeyDeviceDevice);
+#endif
 
 
 void TestSetUnionByKeyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::iterator Iterator;
+  typedef Vector::iterator Iterator;
 
   Vector a_key(3), b_key(4);
   Vector a_val(3), b_val(4);
diff --git a/testing/cuda/set_union_by_key.mk b/testing/cuda/set_union_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/set_union_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/sort.cu b/testing/cuda/sort.cu
similarity index 87%
rename from testing/backend/cuda/sort.cu
rename to testing/cuda/sort.cu
index 901b71789..c3d5ff2bc 100644
--- a/testing/backend/cuda/sort.cu
+++ b/testing/cuda/sort.cu
@@ -4,19 +4,6 @@
 #include <thrust/execution_policy.h>
 
 
-template<typename ExecutionPolicy, typename Iterator, typename Compare, typename Iterator2>
-__global__
-void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp, Iterator2 is_supported)
-{
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
-  thrust::sort(exec, first, last, comp);
-#else
-  *is_supported = false;
-#endif
-}
-
-
 template<typename T>
 struct my_less
 {
@@ -28,21 +15,29 @@ struct my_less
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator, typename Compare>
+__global__
+void sort_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Compare comp)
+{
+  thrust::sort(exec, first, last, comp);
+}
+
+
 template<typename T, typename ExecutionPolicy, typename Compare>
 void TestComparisonSortDevice(ExecutionPolicy exec, const size_t n, Compare comp)
 {
   thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
   thrust::device_vector<T> d_data = h_data;
   
-  thrust::device_vector<bool> is_supported(1);
-  sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp, is_supported.begin());
+  sort_kernel<<<1,1>>>(exec, d_data.begin(), d_data.end(), comp);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
-  if(is_supported[0])
-  {
-    thrust::sort(h_data.begin(), h_data.end(), comp);
-    
-    ASSERT_EQUAL(h_data, d_data);
-  }
+
+  thrust::sort(h_data.begin(), h_data.end(), comp);
+
+  ASSERT_EQUAL(h_data, d_data);
 };
 
 
@@ -107,6 +102,7 @@ VariableUnitTest<
   TestSortDeviceDevice,
   unittest::type_list<unittest::int8_t,unittest::int32_t>
 > TestSortDeviceDeviceInstance;
+#endif
 
 
 void TestSortCudaStreams()
@@ -159,7 +155,7 @@ void TestComparisonSortCudaStreams()
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end(), my_less<int>()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestComparisonSortCudaStreams);
diff --git a/testing/cuda/sort.mk b/testing/cuda/sort.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sort.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/sort_by_key.cu b/testing/cuda/sort_by_key.cu
similarity index 87%
rename from testing/backend/cuda/sort_by_key.cu
rename to testing/cuda/sort_by_key.cu
index 463aeace9..ee2b44ea0 100644
--- a/testing/backend/cuda/sort_by_key.cu
+++ b/testing/cuda/sort_by_key.cu
@@ -4,19 +4,6 @@
 #include <thrust/functional.h>
 
 
-template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Compare, typename Iterator3>
-__global__
-void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp, Iterator3 is_supported)
-{
-#if (__CUDA_ARCH__ >= 200)
-  *is_supported = true;
-  thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
-#else
-  *is_supported = false;
-#endif
-}
-
-
 template<typename T>
 struct my_less
 {
@@ -28,6 +15,15 @@ struct my_less
 };
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Compare>
+__global__
+void sort_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Compare comp)
+{
+  thrust::sort_by_key(exec, keys_first, keys_last, values_first, comp);
+}
+
+
 template<typename T, typename ExecutionPolicy, typename Compare>
 void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare comp)
 {
@@ -36,17 +32,15 @@ void TestComparisonSortByKeyDevice(ExecutionPolicy exec, const size_t n, Compare
 
   thrust::host_vector<T>   h_values = h_keys;
   thrust::device_vector<T> d_values = d_keys;
-  
-  thrust::device_vector<bool> is_supported(1);
-  sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp, is_supported.begin());
 
-  if(is_supported[0])
-  {
-    thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp);
-    
-    ASSERT_EQUAL(h_keys, d_keys);
-    ASSERT_EQUAL(h_values, d_values);
-  }
+  sort_by_key_kernel<<<1,1>>>(exec, d_keys.begin(), d_keys.end(), d_values.begin(), comp);
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
+  thrust::sort_by_key(h_keys.begin(), h_keys.end(), h_values.begin(), comp);
+
+  ASSERT_EQUAL(h_keys, d_keys);
+  ASSERT_EQUAL(h_values, d_values);
 };
 
 
@@ -111,6 +105,7 @@ VariableUnitTest<
   TestSortByKeyDeviceDevice,
   unittest::type_list<unittest::int8_t,unittest::int32_t>
 > TestSortByKeyDeviceDeviceInstance;
+#endif
 
 
 void TestComparisonSortByKeyCudaStreams()
@@ -137,7 +132,7 @@ void TestComparisonSortByKeyCudaStreams()
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
   ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestComparisonSortByKeyCudaStreams);
@@ -167,7 +162,7 @@ void TestSortByKeyCudaStreams()
 
   ASSERT_EQUAL(true, thrust::is_sorted(keys.begin(), keys.end()));
   ASSERT_EQUAL(true, thrust::is_sorted(vals.begin(), vals.end()));
-                      
+
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestSortByKeyCudaStreams);
diff --git a/testing/cuda/sort_by_key.mk b/testing/cuda/sort_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/sort_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/cuda/stream_legacy.cu b/testing/cuda/stream_legacy.cu
new file mode 100644
index 000000000..51c82a096
--- /dev/null
+++ b/testing/cuda/stream_legacy.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamLegacy);
+}
+
+void TestLegacyDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestLegacyDefaultStream);
diff --git a/testing/cuda/stream_per_thread.cmake b/testing/cuda/stream_per_thread.cmake
new file mode 100644
index 000000000..2cea2f938
--- /dev/null
+++ b/testing/cuda/stream_per_thread.cmake
@@ -0,0 +1,13 @@
+# This test should always use per-thread streams on NVCC.
+set_target_properties(${test_target} PROPERTIES
+  COMPILE_OPTIONS
+    $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:--default-stream=per-thread>
+)
+
+thrust_fix_clang_nvcc_build_for(${test_target})
+
+# NVC++ does not have an equivalent option, and will always
+# use the global stream by default.
+if (CMAKE_CUDA_COMPILER_ID STREQUAL "Feta")
+  set_tests_properties(${test_target} PROPERTIES WILL_FAIL ON)
+endif()
diff --git a/testing/cuda/stream_per_thread.cu b/testing/cuda/stream_per_thread.cu
new file mode 100644
index 000000000..ef126e78a
--- /dev/null
+++ b/testing/cuda/stream_per_thread.cu
@@ -0,0 +1,21 @@
+#include <unittest/unittest.h>
+#include <thrust/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thread>
+
+void verify_stream()
+{
+  auto exec = thrust::device;
+  auto stream = thrust::cuda_cub::stream(exec);
+  ASSERT_EQUAL(stream, cudaStreamPerThread);
+}
+
+void TestPerThreadDefaultStream()
+{
+  verify_stream();
+
+  std::thread t(verify_stream);
+  t.join();
+}
+DECLARE_UNITTEST(TestPerThreadDefaultStream);
diff --git a/testing/cuda/stream_per_thread.mk b/testing/cuda/stream_per_thread.mk
new file mode 100644
index 000000000..da9adfe1b
--- /dev/null
+++ b/testing/cuda/stream_per_thread.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += --default-stream per-thread
diff --git a/testing/backend/cuda/swap_ranges.cu b/testing/cuda/swap_ranges.cu
similarity index 93%
rename from testing/backend/cuda/swap_ranges.cu
rename to testing/cuda/swap_ranges.cu
index ce353ee53..ebc396e83 100644
--- a/testing/backend/cuda/swap_ranges.cu
+++ b/testing/cuda/swap_ranges.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void swap_ranges_kernel(ExecutionPolicy exec, Iterator1 first1, Iterator1 last1, Iterator2 first2)
@@ -15,7 +16,6 @@ template<typename ExecutionPolicy>
 void TestSwapRangesDevice(ExecutionPolicy exec)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
@@ -24,6 +24,8 @@ void TestSwapRangesDevice(ExecutionPolicy exec)
   v2[0] = 5; v2[1] = 6; v2[2] = 7; v2[3] = 8; v2[4] = 9;
 
   swap_ranges_kernel<<<1,1>>>(exec, v1.begin(), v1.end(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   ASSERT_EQUAL(v1[0], 5);
   ASSERT_EQUAL(v1[1], 6);
@@ -49,11 +51,11 @@ void TestSwapRangesDeviceDevice()
   TestSwapRangesDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestSwapRangesDeviceDevice);
+#endif
 
 void TestSwapRangesCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
 
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
diff --git a/testing/cuda/swap_ranges.mk b/testing/cuda/swap_ranges.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/swap_ranges.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/tabulate.cu b/testing/cuda/tabulate.cu
similarity index 86%
rename from testing/backend/cuda/tabulate.cu
rename to testing/cuda/tabulate.cu
index 463bb49bf..b449fb7cc 100644
--- a/testing/backend/cuda/tabulate.cu
+++ b/testing/cuda/tabulate.cu
@@ -4,6 +4,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename Function>
 __global__
 void tabulate_kernel(ExecutionPolicy exec, Iterator first, Iterator last, Function f)
@@ -22,6 +23,10 @@ void TestTabulateDevice(ExecutionPolicy exec)
   Vector v(5);
 
   tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), thrust::identity<T>());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(v[0], 0);
   ASSERT_EQUAL(v[1], 1);
@@ -30,6 +35,10 @@ void TestTabulateDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(v[4], 4);
 
   tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), -_1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(v[0],  0);
   ASSERT_EQUAL(v[1], -1);
@@ -38,6 +47,10 @@ void TestTabulateDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(v[4], -4);
   
   tabulate_kernel<<<1,1>>>(exec, v.begin(), v.end(), _1 * _1 * _1);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
 
   ASSERT_EQUAL(v[0], 0);
   ASSERT_EQUAL(v[1], 1);
@@ -57,12 +70,13 @@ void TestTabulateDeviceDevice()
   TestTabulateDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTabulateDeviceDevice);
+#endif
 
 void TestTabulateCudaStreams()
 {
   using namespace thrust::placeholders;
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
   
   Vector v(5);
 
diff --git a/testing/cuda/tabulate.mk b/testing/cuda/tabulate.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/tabulate.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/transform.cu b/testing/cuda/transform.cu
similarity index 89%
rename from testing/backend/cuda/transform.cu
rename to testing/cuda/transform.cu
index dd2fa09d0..7739089e6 100644
--- a/testing/backend/cuda/transform.cu
+++ b/testing/cuda/transform.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function, typename Iterator3>
 __global__
 void transform_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function f, Iterator3 result2)
@@ -28,9 +29,12 @@ void TestTransformUnaryDevice(ExecutionPolicy exec)
   thrust::device_vector<typename Vector::iterator> iter_vec(1);
   
   transform_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   iter = iter_vec[0];
   
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(output, result);
 }
 
@@ -79,9 +83,12 @@ void TestTransformIfUnaryNoStencilDevice(ExecutionPolicy exec)
                                thrust::negate<T>(),
                                thrust::identity<T>(),
                                iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   iter = iter_vec[0];
   
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(output, result);
 }
 
@@ -133,10 +140,12 @@ void TestTransformIfUnaryDevice(ExecutionPolicy exec)
                                thrust::negate<T>(),
                                thrust::identity<T>(),
                                iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
 
   iter = iter_vec[0];
   
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(output, result);
 }
 
@@ -180,9 +189,12 @@ void TestTransformBinaryDevice(ExecutionPolicy exec)
   thrust::device_vector<typename Vector::iterator> iter_vec(1);
   
   transform_kernel<<<1,1>>>(exec, input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>(), iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   iter = iter_vec[0];
   
-  ASSERT_EQUAL(iter - output.begin(), input1.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
   ASSERT_EQUAL(output, result);
 }
 
@@ -239,9 +251,12 @@ void TestTransformIfBinaryDevice(ExecutionPolicy exec)
                                thrust::minus<T>(),
                                thrust::not1(identity),
                                iter_vec.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   iter = iter_vec[0];
   
-  ASSERT_EQUAL(iter - output.begin(), input1.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
   ASSERT_EQUAL(output, result);
 }
 
@@ -256,13 +271,14 @@ void TestTransformIfBinaryDeviceDevice()
   TestTransformIfBinaryDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformIfBinaryDeviceDevice);
+#endif
 
 void TestTransformUnaryCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
   
-  typename Vector::iterator iter;
+  Vector::iterator iter;
 
   Vector input(3);
   Vector output(3);
@@ -276,7 +292,7 @@ void TestTransformUnaryCudaStreams()
   iter = thrust::transform(thrust::cuda::par.on(s), input.begin(), input.end(), output.begin(), thrust::negate<T>());
   cudaStreamSynchronize(s);
   
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(output, result);
 
   cudaStreamDestroy(s);
@@ -287,9 +303,9 @@ DECLARE_UNITTEST(TestTransformUnaryCudaStreams);
 void TestTransformBinaryCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
-  typename Vector::iterator iter;
+  Vector::iterator iter;
 
   Vector input1(3);
   Vector input2(3);
@@ -305,7 +321,7 @@ void TestTransformBinaryCudaStreams()
   iter = thrust::transform(thrust::cuda::par.on(s), input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>());
   cudaStreamSynchronize(s);
   
-  ASSERT_EQUAL(iter - output.begin(), input1.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
   ASSERT_EQUAL(output, result);
 
   cudaStreamDestroy(s);
diff --git a/testing/cuda/transform.mk b/testing/cuda/transform.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/transform_reduce.cu b/testing/cuda/transform_reduce.cu
similarity index 91%
rename from testing/backend/cuda/transform_reduce.cu
rename to testing/cuda/transform_reduce.cu
index 06d176258..c55aa66e7 100644
--- a/testing/backend/cuda/transform_reduce.cu
+++ b/testing/cuda/transform_reduce.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Function1, typename T, typename Function2, typename Iterator2>
 __global__
 void transform_reduce_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Function1 f1, T init, Function2 f2, Iterator2 result)
@@ -25,6 +26,8 @@ void TestTransformReduceDevice(ExecutionPolicy exec)
   thrust::device_vector<T> result(1);
 
   transform_reduce_kernel<<<1,1>>>(exec, data.begin(), data.end(), thrust::negate<T>(), init, thrust::plus<T>(), result.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
   
   ASSERT_EQUAL(8, (T)result[0]);
 }
@@ -42,12 +45,13 @@ void TestTransformReduceDeviceDevice()
   TestTransformReduceDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformReduceDeviceDevice);
+#endif
 
 
 void TestTransformReduceCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
   
   Vector data(3);
   data[0] = 1; data[1] = -2; data[2] = 3;
diff --git a/testing/cuda/transform_reduce.mk b/testing/cuda/transform_reduce.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform_reduce.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/transform_scan.cu b/testing/cuda/transform_scan.cu
similarity index 73%
rename from testing/backend/cuda/transform_scan.cu
rename to testing/cuda/transform_scan.cu
index b27c598a8..de0d1524f 100644
--- a/testing/backend/cuda/transform_scan.cu
+++ b/testing/cuda/transform_scan.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Function1, typename Function2, typename Iterator3>
 __global__
 void transform_inclusive_scan_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Function1 f1, Function2 f2, Iterator3 result2)
@@ -39,41 +40,66 @@ void TestTransformScanDevice(ExecutionPolicy exec)
   
   // inclusive scan
   transform_inclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   ref[0] = -1; ref[1] = -4; ref[2] = -2; ref[3] = -6; ref[4] = -1;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(ref, output);
   
   // exclusive scan with 0 init
   transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), 0, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   ref[0] = 0; ref[1] = -1; ref[2] = -4; ref[3] = -2; ref[4] = -6;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(ref, output);
   
   // exclusive scan with nonzero init
   transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), output.begin(), thrust::negate<T>(), 3, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   ref[0] = 3; ref[1] = 2; ref[2] = -1; ref[3] = 1; ref[4] = -3;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(ref, output);
   
   // inplace inclusive scan
   input = input_copy;
   transform_inclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), input.begin(), thrust::negate<T>(), thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   ref[0] = -1; ref[1] = -4; ref[2] = -2; ref[3] = -6; ref[4] = -1;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(ref, input);
   
   // inplace exclusive scan with init
   input = input_copy;
   transform_exclusive_scan_kernel<<<1,1>>>(exec, input.begin(), input.end(), input.begin(), thrust::negate<T>(), 3, thrust::plus<T>(), iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   ref[0] = 3; ref[1] = 2; ref[2] = -1; ref[3] = 1; ref[4] = -3;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(ref, input);
 }
 
@@ -90,14 +116,15 @@ void TestTransformScanDeviceDevice()
   TestTransformScanDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestTransformScanDeviceDevice);
+#endif
 
 
 void TestTransformScanCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
+  typedef Vector::value_type T;
 
-  typename Vector::iterator iter;
+  Vector::iterator iter;
 
   Vector input(5);
   Vector result(5);
@@ -115,7 +142,7 @@ void TestTransformScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -124,7 +151,7 @@ void TestTransformScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 0; result[1] = -1; result[2] = -4; result[3] = -2; result[4] = -6;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -133,7 +160,7 @@ void TestTransformScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
-  ASSERT_EQUAL(iter - output.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
   ASSERT_EQUAL(input,  input_copy);
   ASSERT_EQUAL(output, result);
   
@@ -143,7 +170,7 @@ void TestTransformScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(input, result);
 
   // inplace exclusive scan with init
@@ -152,10 +179,37 @@ void TestTransformScanCudaStreams()
   cudaStreamSynchronize(s);
 
   result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
-  ASSERT_EQUAL(iter - input.begin(), input.size());
+  ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
   ASSERT_EQUAL(input, result);
 
   cudaStreamDestroy(s);
 }
 DECLARE_UNITTEST(TestTransformScanCudaStreams);
 
+void TestTransformScanConstAccumulator()
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector::iterator iter;
+
+  Vector input(5);
+  Vector reference(5);
+  Vector output(5);
+
+  input[0] = 1;
+  input[1] = 3;
+  input[2] = -2;
+  input[3] = 4;
+  input[4] = -5;
+
+  thrust::transform_inclusive_scan(input.begin(),
+                                   input.end(),
+                                   output.begin(),
+                                   thrust::identity<T>(),
+                                   thrust::plus<T>());
+  thrust::inclusive_scan(input.begin(), input.end(), reference.begin(), thrust::plus<T>());
+
+  ASSERT_EQUAL(output, reference);
+}
+DECLARE_UNITTEST(TestTransformScanConstAccumulator);
diff --git a/testing/cuda/transform_scan.mk b/testing/cuda/transform_scan.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/transform_scan.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/uninitialized_copy.cu b/testing/cuda/uninitialized_copy.cu
similarity index 92%
rename from testing/backend/cuda/uninitialized_copy.cu
rename to testing/cuda/uninitialized_copy.cu
index 3c8717b6e..735e2dac3 100644
--- a/testing/backend/cuda/uninitialized_copy.cu
+++ b/testing/cuda/uninitialized_copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void uninitialized_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -15,7 +16,6 @@ template<typename ExecutionPolicy>
 void TestUninitializedCopyDevice(ExecutionPolicy exec)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
@@ -23,6 +23,9 @@ void TestUninitializedCopyDevice(ExecutionPolicy exec)
   // copy to Vector
   Vector v2(5);
   uninitialized_copy_kernel<<<1,1>>>(exec, v1.begin(), v1.end(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   ASSERT_EQUAL(v2[0], 0);
   ASSERT_EQUAL(v2[1], 1);
   ASSERT_EQUAL(v2[2], 2);
@@ -43,12 +46,12 @@ void TestUninitializedCopyDeviceDevice()
   TestUninitializedCopyDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedCopyDeviceDevice);
+#endif
 
 
 void TestUninitializedCopyCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
@@ -73,6 +76,7 @@ void TestUninitializedCopyCudaStreams()
 DECLARE_UNITTEST(TestUninitializedCopyCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Size, typename Iterator2>
 __global__
 void uninitialized_copy_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, Iterator2 result)
@@ -85,7 +89,6 @@ template<typename ExecutionPolicy>
 void TestUninitializedCopyNDevice(ExecutionPolicy exec)
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
@@ -93,6 +96,9 @@ void TestUninitializedCopyNDevice(ExecutionPolicy exec)
   // copy to Vector
   Vector v2(5);
   uninitialized_copy_n_kernel<<<1,1>>>(exec, v1.begin(), v1.size(), v2.begin());
+  cudaError_t const err = cudaDeviceSynchronize();
+  ASSERT_EQUAL(cudaSuccess, err);
+
   ASSERT_EQUAL(v2[0], 0);
   ASSERT_EQUAL(v2[1], 1);
   ASSERT_EQUAL(v2[2], 2);
@@ -113,12 +119,12 @@ void TestUninitializedCopyNDeviceDevice()
   TestUninitializedCopyNDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedCopyNDeviceDevice);
+#endif
 
 
 void TestUninitializedCopyNCudaStreams()
 {
   typedef thrust::device_vector<int> Vector;
-  typedef typename Vector::value_type T;
   
   Vector v1(5);
   v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
diff --git a/testing/cuda/uninitialized_copy.mk b/testing/cuda/uninitialized_copy.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/uninitialized_copy.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/uninitialized_fill.cu b/testing/cuda/uninitialized_fill.cu
similarity index 86%
rename from testing/backend/cuda/uninitialized_fill.cu
rename to testing/cuda/uninitialized_fill.cu
index 4095f7cbc..bb222cf02 100644
--- a/testing/backend/cuda/uninitialized_fill.cu
+++ b/testing/cuda/uninitialized_fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/execution_policy.h>
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator, typename T>
 __global__
 void uninitialized_fill_kernel(ExecutionPolicy exec, Iterator first, Iterator last, T val)
@@ -23,6 +24,10 @@ void TestUninitializedFillDevice(ExecutionPolicy exec)
   T exemplar(7);
   
   uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 1, v.begin() + 4, exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], 0);
   ASSERT_EQUAL(v[1], exemplar);
@@ -33,6 +38,10 @@ void TestUninitializedFillDevice(ExecutionPolicy exec)
   exemplar = 8;
   
   uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 0, v.begin() + 3, exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], exemplar);
   ASSERT_EQUAL(v[1], exemplar);
@@ -43,6 +52,10 @@ void TestUninitializedFillDevice(ExecutionPolicy exec)
   exemplar = 9;
   
   uninitialized_fill_kernel<<<1,1>>>(exec, v.begin() + 2, v.end(), exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], 8);
   ASSERT_EQUAL(v[1], 8);
@@ -53,6 +66,10 @@ void TestUninitializedFillDevice(ExecutionPolicy exec)
   exemplar = 1;
   
   uninitialized_fill_kernel<<<1,1>>>(exec, v.begin(), v.end(), exemplar);
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
   
   ASSERT_EQUAL(v[0], exemplar);
   ASSERT_EQUAL(v[1], exemplar);
@@ -74,6 +91,7 @@ void TestUninitializedFillDeviceDevice()
   TestUninitializedFillDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedFillDeviceDevice);
+#endif
 
 
 void TestUninitializedFillCudaStreams()
@@ -103,6 +121,7 @@ void TestUninitializedFillCudaStreams()
 DECLARE_UNITTEST(TestUninitializedFillCudaStreams);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Size, typename T, typename Iterator2>
 __global__
 void uninitialized_fill_n_kernel(ExecutionPolicy exec, Iterator1 first, Size n, T val, Iterator2 result)
@@ -125,6 +144,11 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec)
   thrust::device_vector<Vector::iterator> iter_vec(1);
   
   uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 1, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   Vector::iterator iter = iter_vec[0];
   
   ASSERT_EQUAL(v[0], 0);
@@ -137,6 +161,11 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec)
   exemplar = 8;
   
   uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 0, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   
   ASSERT_EQUAL(v[0], exemplar);
@@ -149,6 +178,11 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec)
   exemplar = 9;
   
   uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin() + 2, 3, exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   
   ASSERT_EQUAL(v[0], 8);
@@ -161,6 +195,11 @@ void TestUninitializedFillNDevice(ExecutionPolicy exec)
   exemplar = 1;
   
   uninitialized_fill_n_kernel<<<1,1>>>(exec, v.begin(), v.size(), exemplar, iter_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   iter = iter_vec[0];
   
   ASSERT_EQUAL(v[0], exemplar);
@@ -184,6 +223,7 @@ void TestUninitializedFillNDeviceDevice()
   TestUninitializedFillNDevice(thrust::device);
 }
 DECLARE_UNITTEST(TestUninitializedFillNDeviceDevice);
+#endif
 
 
 void TestUninitializedFillNCudaStreams()
diff --git a/testing/cuda/uninitialized_fill.mk b/testing/cuda/uninitialized_fill.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/uninitialized_fill.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/unique.cu b/testing/cuda/unique.cu
similarity index 55%
rename from testing/backend/cuda/unique.cu
rename to testing/cuda/unique.cu
index 0c7314ee2..136ba76fd 100644
--- a/testing/backend/cuda/unique.cu
+++ b/testing/cuda/unique.cu
@@ -3,6 +3,15 @@
 #include <thrust/execution_policy.h>
 
 
+template<typename T>
+struct is_equal_div_10_unique
+{
+  __host__ __device__
+  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
+};
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
 __global__
 void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
@@ -19,14 +28,6 @@ void unique_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Binary
 }
 
 
-template<typename T>
-struct is_equal_div_10_unique
-{
-  __host__ __device__
-  bool operator()(const T x, const T& y) const { return ((int) x / 10) == ((int) y / 10); }
-};
-
-
 template<typename ExecutionPolicy>
 void TestUniqueDevice(ExecutionPolicy exec)
 {
@@ -49,6 +50,11 @@ void TestUniqueDevice(ExecutionPolicy exec)
   Vector::iterator new_last;
   
   unique_kernel<<<1,1>>>(exec, data.begin(), data.end(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last - data.begin(), 7);
@@ -61,6 +67,11 @@ void TestUniqueDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(data[6], 37);
 
   unique_kernel<<<1,1>>>(exec, data.begin(), new_last, is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -84,7 +95,16 @@ void TestUniqueDeviceDevice()
 DECLARE_UNITTEST(TestUniqueDeviceDevice);
 
 
-void TestUniqueCudaStreams()
+void TestUniqueDeviceNoSync()
+{
+  TestUniqueDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -106,8 +126,10 @@ void TestUniqueCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), data.end());
+  new_last = thrust::unique(streampolicy, data.begin(), data.end());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 7);
@@ -119,7 +141,7 @@ void TestUniqueCudaStreams()
   ASSERT_EQUAL(data[5], 31);
   ASSERT_EQUAL(data[6], 37);
 
-  new_last = thrust::unique(thrust::cuda::par.on(s), data.begin(), new_last, is_equal_div_10_unique<T>());
+  new_last = thrust::unique(streampolicy, data.begin(), new_last, is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -129,9 +151,22 @@ void TestUniqueCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCudaStreams);
+
+void TestUniqueCudaStreamsSync()
+{
+  TestUniqueCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCudaStreamsSync);
+
+
+void TestUniqueCudaStreamsNoSync()
+{
+  TestUniqueCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCudaStreamsNoSync);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void unique_copy_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result1, Iterator3 result2)
@@ -172,6 +207,11 @@ void TestUniqueCopyDevice(ExecutionPolicy exec)
   Vector::iterator new_last;
   
   unique_copy_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last - output.begin(), 7);
@@ -184,6 +224,11 @@ void TestUniqueCopyDevice(ExecutionPolicy exec)
   ASSERT_EQUAL(output[6], 37);
 
   unique_copy_kernel<<<1,1>>>(exec, output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -207,7 +252,16 @@ void TestUniqueCopyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueCopyDeviceDevice);
 
 
-void TestUniqueCopyCudaStreams()
+void TestUniqueCopyDeviceNoSync()
+{
+  TestUniqueCopyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -231,8 +285,10 @@ void TestUniqueCopyCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique_copy(thrust::cuda::par.on(s), data.begin(), data.end(), output.begin());
+  new_last = thrust::unique_copy(streampolicy, data.begin(), data.end(), output.begin());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - output.begin(), 7);
@@ -244,7 +300,7 @@ void TestUniqueCopyCudaStreams()
   ASSERT_EQUAL(output[5], 31);
   ASSERT_EQUAL(output[6], 37);
 
-  new_last = thrust::unique_copy(thrust::cuda::par.on(s), output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_copy(streampolicy, output.begin(), new_last, data.begin(), is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last - data.begin(), 3);
@@ -254,5 +310,144 @@ void TestUniqueCopyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCopyCudaStreams);
+
+void TestUniqueCopyCudaStreamsSync()
+{
+  TestUniqueCopyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCopyCudaStreamsSync);
+
+
+void TestUniqueCopyCudaStreamsNoSync()
+{
+  TestUniqueCopyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyCudaStreamsNoSync);
+
+
+#ifdef THRUST_TEST_DEVICE_SIDE
+template<typename ExecutionPolicy, typename Iterator1, typename Iterator2>
+__global__
+void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, Iterator2 result)
+{
+  *result = thrust::unique_count(exec, first, last);
+}
+
+
+template<typename ExecutionPolicy, typename Iterator1, typename BinaryPredicate, typename Iterator2>
+__global__
+void unique_count_kernel(ExecutionPolicy exec, Iterator1 first, Iterator1 last, BinaryPredicate pred, Iterator2 result)
+{
+  *result = thrust::unique_count(exec, first, last, pred);
+}
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCountDevice(ExecutionPolicy exec)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37; 
+  
+  Vector output(1, -1);
+  
+  unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(output[0], 7);
+
+  unique_count_kernel<<<1,1>>>(exec, data.begin(), data.end(), is_equal_div_10_unique<T>(), output.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
+  ASSERT_EQUAL(output[0], 3);
+}
+
+
+void TestUniqueCountDeviceSeq()
+{
+  TestUniqueCountDevice(thrust::seq);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceSeq);
+
+
+void TestUniqueCountDeviceDevice()
+{
+  TestUniqueCountDevice(thrust::device);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceDevice);
+
+
+void TestUniqueCountDeviceNoSync()
+{
+  TestUniqueCountDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCountDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCountCudaStreams(ExecutionPolicy policy)
+{
+  typedef thrust::device_vector<int> Vector;
+  typedef Vector::value_type T;
+
+  Vector data(10);
+  data[0] = 11; 
+  data[1] = 11; 
+  data[2] = 12;
+  data[3] = 20; 
+  data[4] = 29; 
+  data[5] = 21; 
+  data[6] = 21; 
+  data[7] = 31; 
+  data[8] = 31; 
+  data[9] = 37;
+
+  cudaStream_t s;
+  cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
+  
+  int result = thrust::unique_count(streampolicy, data.begin(), data.end());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(result, 7);
+
+  result = thrust::unique_count(streampolicy, data.begin(), data.end(), is_equal_div_10_unique<T>());
+  cudaStreamSynchronize(s);
+
+  ASSERT_EQUAL(result, 3);
+
+  cudaStreamDestroy(s);
+}
+
+void TestUniqueCountCudaStreamsSync()
+{
+  TestUniqueCountCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCountCudaStreamsSync);
+
+
+void TestUniqueCountCudaStreamsNoSync()
+{
+  TestUniqueCountCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCountCudaStreamsNoSync);
 
diff --git a/testing/cuda/unique.mk b/testing/cuda/unique.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/unique.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/cuda/unique_by_key.cu b/testing/cuda/unique_by_key.cu
similarity index 79%
rename from testing/backend/cuda/unique_by_key.cu
rename to testing/cuda/unique_by_key.cu
index de7ad879e..d96cbdc6c 100644
--- a/testing/backend/cuda/unique_by_key.cu
+++ b/testing/cuda/unique_by_key.cu
@@ -44,6 +44,7 @@ void initialize_values(Vector& values)
 }
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3>
 __global__
 void unique_by_key_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 result)
@@ -77,6 +78,11 @@ void TestUniqueByKeyDevice(ExecutionPolicy exec)
   initialize_keys(keys);  initialize_values(values);
   
   unique_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
@@ -97,6 +103,11 @@ void TestUniqueByKeyDevice(ExecutionPolicy exec)
   initialize_keys(keys);  initialize_values(values);
   
   unique_by_key_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
@@ -124,7 +135,16 @@ void TestUniqueByKeyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueByKeyDeviceDevice);
 
 
-void TestUniqueByKeyCudaStreams()
+void TestUniqueByKeyDeviceNoSync()
+{
+  TestUniqueByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -132,7 +152,7 @@ void TestUniqueByKeyCudaStreams()
   Vector keys;
   Vector values;
   
-  typedef thrust::pair<typename Vector::iterator, typename Vector::iterator> iter_pair;
+  typedef thrust::pair<Vector::iterator, Vector::iterator> iter_pair;
   iter_pair new_last;
   
   // basic test
@@ -140,8 +160,10 @@ void TestUniqueByKeyCudaStreams()
 
   cudaStream_t s;
   cudaStreamCreate(&s);
+
+  auto streampolicy = policy.on(s);
   
-  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin());
+  new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin());
   cudaStreamSynchronize(s);
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   5);
@@ -161,7 +183,7 @@ void TestUniqueByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::unique_by_key(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_by_key(streampolicy, keys.begin(), keys.end(), values.begin(), is_equal_div_10_unique<T>());
   
   ASSERT_EQUAL(new_last.first  - keys.begin(),   3);
   ASSERT_EQUAL(new_last.second - values.begin(), 3);
@@ -175,9 +197,22 @@ void TestUniqueByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueByKeyCudaStreams);
+
+void TestUniqueByKeyCudaStreamsSync()
+{
+  TestUniqueByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsSync);
+
+
+void TestUniqueByKeyCudaStreamsNoSync()
+{
+  TestUniqueByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueByKeyCudaStreamsNoSync);
 
 
+#ifdef THRUST_TEST_DEVICE_SIDE
 template<typename ExecutionPolicy, typename Iterator1, typename Iterator2, typename Iterator3, typename Iterator4, typename Iterator5>
 __global__
 void unique_by_key_copy_kernel(ExecutionPolicy exec, Iterator1 keys_first, Iterator1 keys_last, Iterator2 values_first, Iterator3 keys_result, Iterator4 values_result, Iterator5 result)
@@ -214,6 +249,11 @@ void TestUniqueCopyByKeyDevice(ExecutionPolicy exec)
   Vector output_values(values.size());
 
   unique_by_key_copy_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
@@ -234,6 +274,11 @@ void TestUniqueCopyByKeyDevice(ExecutionPolicy exec)
   initialize_keys(keys);  initialize_values(values);
   
   unique_by_key_copy_kernel<<<1,1>>>(exec, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>(), new_last_vec.begin());
+  {
+    cudaError_t const err = cudaDeviceSynchronize();
+    ASSERT_EQUAL(cudaSuccess, err);
+  }
+
   new_last = new_last_vec[0];
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
@@ -262,7 +307,16 @@ void TestUniqueCopyByKeyDeviceDevice()
 DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceDevice);
 
 
-void TestUniqueCopyByKeyCudaStreams()
+void TestUniqueCopyByKeyDeviceNoSync()
+{
+  TestUniqueCopyByKeyDevice(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyDeviceNoSync);
+#endif
+
+
+template<typename ExecutionPolicy>
+void TestUniqueCopyByKeyCudaStreams(ExecutionPolicy policy)
 {
   typedef thrust::device_vector<int> Vector;
   typedef Vector::value_type T;
@@ -270,7 +324,7 @@ void TestUniqueCopyByKeyCudaStreams()
   Vector keys;
   Vector values;
 
-  typedef thrust::pair<typename Vector::iterator, typename Vector::iterator> iter_pair;
+  typedef thrust::pair<Vector::iterator, Vector::iterator> iter_pair;
   iter_pair new_last;
 
   // basic test
@@ -282,7 +336,9 @@ void TestUniqueCopyByKeyCudaStreams()
   cudaStream_t s;
   cudaStreamCreate(&s);
 
-  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
+  auto streampolicy = policy.on(s);
+
+  new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   5);
@@ -302,7 +358,7 @@ void TestUniqueCopyByKeyCudaStreams()
   // test BinaryPredicate
   initialize_keys(keys);  initialize_values(values);
   
-  new_last = thrust::unique_by_key_copy(thrust::cuda::par.on(s), keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
+  new_last = thrust::unique_by_key_copy(streampolicy, keys.begin(), keys.end(), values.begin(), output_keys.begin(), output_values.begin(), is_equal_div_10_unique<T>());
   cudaStreamSynchronize(s);
 
   ASSERT_EQUAL(new_last.first  - output_keys.begin(),   3);
@@ -317,5 +373,17 @@ void TestUniqueCopyByKeyCudaStreams()
 
   cudaStreamDestroy(s);
 }
-DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreams);
+
+void TestUniqueCopyByKeyCudaStreamsSync()
+{
+  TestUniqueCopyByKeyCudaStreams(thrust::cuda::par);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsSync);
+
+
+void TestUniqueCopyByKeyCudaStreamsNoSync()
+{
+  TestUniqueCopyByKeyCudaStreams(thrust::cuda::par_nosync);
+}
+DECLARE_UNITTEST(TestUniqueCopyByKeyCudaStreamsNoSync);
 
diff --git a/testing/cuda/unique_by_key.mk b/testing/cuda/unique_by_key.mk
new file mode 100644
index 000000000..7d930481e
--- /dev/null
+++ b/testing/cuda/unique_by_key.mk
@@ -0,0 +1 @@
+CUDACC_FLAGS += -rdc=true
diff --git a/testing/backend/decompose.cu b/testing/decompose.cu
similarity index 100%
rename from testing/backend/decompose.cu
rename to testing/decompose.cu
diff --git a/testing/dependencies_aware_policies.cu b/testing/dependencies_aware_policies.cu
new file mode 100644
index 000000000..531339215
--- /dev/null
+++ b/testing/dependencies_aware_policies.cu
@@ -0,0 +1,189 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/seq.h>
+#include <thrust/system/cpp/detail/par.h>
+#include <thrust/system/omp/detail/par.h>
+#include <thrust/system/tbb/detail/par.h>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#  include <thrust/system/cuda/detail/par.h>
+#endif
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename T>
+struct test_allocator_t
+{
+};
+
+test_allocator_t<int> test_allocator = test_allocator_t<int>();
+
+template<int I>
+struct test_dependency_t
+{
+};
+
+template<int I>
+test_dependency_t<I> test_dependency()
+{
+    return {};
+}
+
+template<typename Policy, template<typename> class CRTPBase>
+struct policy_info
+{
+    using policy = Policy;
+
+    template<template<template<typename> class, typename...> class Template, typename ...Arguments>
+    using apply_base_first = Template<CRTPBase, Arguments...>;
+
+    template<template<typename, template<typename> class, typename...> class Template, typename First, typename ...Arguments>
+    using apply_base_second = Template<First, CRTPBase, Arguments...>;
+};
+
+template<typename PolicyInfo>
+struct TestDependencyAttachment
+{
+    template<typename ...Expected, typename T>
+    static void assert_correct(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_first<
+                    thrust::detail::execute_with_dependencies,
+                    Expected...
+                >
+            >::value), true);
+    }
+
+    template<typename Allocator, typename ...Expected, typename T>
+    static void assert_correct_with_allocator(T)
+    {
+        ASSERT_EQUAL(
+            (thrust::detail::is_same<
+                T,
+                typename PolicyInfo::template apply_base_second<
+                    thrust::detail::execute_with_allocator_and_dependencies,
+                    Allocator,
+                    Expected...
+                >
+            >::value), true);
+    }
+
+    void operator()()
+    {
+        typename PolicyInfo::policy policy;
+
+        assert_correct<
+            test_dependency_t<1>
+        >(policy
+            .after(
+                test_dependency<1>()
+            )
+        );
+
+        assert_correct<
+            test_dependency_t<1>,
+            test_dependency_t<2>
+        >(policy
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>()
+            )
+        );
+
+        assert_correct<
+            test_dependency_t<1>,
+            test_dependency_t<2>,
+            test_dependency_t<3>
+        >(policy
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>(),
+                test_dependency<3>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>,
+            test_dependency_t<2>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>()
+            )
+        );
+
+        assert_correct_with_allocator<
+            test_allocator_t<int> &,
+            test_dependency_t<1>,
+            test_dependency_t<2>,
+            test_dependency_t<3>
+        >(policy(test_allocator)
+            .after(
+                test_dependency<1>(),
+                test_dependency<2>(),
+                test_dependency<3>()
+            )
+        );
+    }
+};
+
+typedef policy_info<
+    thrust::detail::seq_t,
+    thrust::system::detail::sequential::execution_policy
+> sequential_info;
+typedef policy_info<
+    thrust::system::cpp::detail::par_t,
+    thrust::system::cpp::detail::execution_policy
+> cpp_par_info;
+typedef policy_info<
+    thrust::system::omp::detail::par_t,
+    thrust::system::omp::detail::execution_policy
+> omp_par_info;
+typedef policy_info<
+    thrust::system::tbb::detail::par_t,
+    thrust::system::tbb::detail::execution_policy
+> tbb_par_info;
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+typedef policy_info<
+    thrust::system::cuda::detail::par_t,
+    thrust::cuda_cub::execute_on_stream_base
+> cuda_par_info;
+#endif
+
+SimpleUnitTest<
+    TestDependencyAttachment,
+    unittest::type_list<
+        // TODO: uncomment when dependencies are generalized to all backends
+        // sequential_info,
+        // cpp_par_info,
+        // omp_par_info,
+        // tbb_par_info,
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+        cuda_par_info
+#endif
+    >
+> TestDependencyAttachmentInstance;
+
+#else // C++11
+
+void TestDummy()
+{
+}
+DECLARE_UNITTEST(TestDummy);
+
+#endif // C++11
diff --git a/testing/dereference.cu b/testing/dereference.cu
index 511f02842..ef5a991ef 100644
--- a/testing/dereference.cu
+++ b/testing/dereference.cu
@@ -7,7 +7,7 @@
 #include <thrust/iterator/counting_iterator.h>
 
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 
 template <typename Iterator1, typename Iterator2>
@@ -106,4 +106,4 @@ void TestDeviceDereferenceTransformedCountingIterator(void)
 }
 DECLARE_UNITTEST(TestDeviceDereferenceTransformedCountingIterator);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/device_delete.cu b/testing/device_delete.cu
index b32d4b27b..12f757fa4 100644
--- a/testing/device_delete.cu
+++ b/testing/device_delete.cu
@@ -4,26 +4,29 @@
 #include <thrust/device_new.h>
 #include <thrust/device_delete.h>
 
+#include <nv/target>
+
 struct Foo
 {
   __host__ __device__
   Foo(void)
-    :set_me_upon_destruction(0)
+    : set_me_upon_destruction{nullptr}
   {}
 
   __host__ __device__
   ~Foo(void)
   {
-#ifdef __CUDA_ARCH__
-    // __device__ overload
-    if(set_me_upon_destruction != 0)
-      *set_me_upon_destruction = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      if (set_me_upon_destruction != nullptr)
+      {
+        *set_me_upon_destruction = true;
+      }));
   }
 
   bool *set_me_upon_destruction;
 };
 
+#if !defined(__QNX__)
 void TestDeviceDeleteDestructorInvocation(void)
 {
   KNOWN_FAILURE;
@@ -43,4 +46,4 @@ void TestDeviceDeleteDestructorInvocation(void)
 //  ASSERT_EQUAL(true, destructor_flag[0]);
 }
 DECLARE_UNITTEST(TestDeviceDeleteDestructorInvocation);
-
+#endif
diff --git a/testing/device_ptr.cu b/testing/device_ptr.cu
index ab3d5e3d1..c3e7c8bf8 100644
--- a/testing/device_ptr.cu
+++ b/testing/device_ptr.cu
@@ -4,8 +4,6 @@
 
 void TestDevicePointerManipulation(void)
 {
-    typedef int T;
-
     thrust::device_vector<int> data(5);
 
     thrust::device_ptr<int> begin(&data[0]);
@@ -93,3 +91,31 @@ void TestRawPointerCast(void)
 }
 DECLARE_VECTOR_UNITTEST(TestRawPointerCast);
 
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T>
+void TestDevicePointerNullptrCompatibility()
+{
+    thrust::device_ptr<T> p0(nullptr);
+
+    ASSERT_EQUAL_QUIET(nullptr, p0);
+    ASSERT_EQUAL_QUIET(p0, nullptr);
+
+    p0 = nullptr;
+
+    ASSERT_EQUAL_QUIET(nullptr, p0);
+    ASSERT_EQUAL_QUIET(p0, nullptr);
+}
+DECLARE_GENERIC_UNITTEST(TestDevicePointerNullptrCompatibility);
+
+template<typename T>
+void TestDevicePointerBoolConversion()
+{
+    thrust::device_ptr<T> p0(nullptr);
+    auto const b = bool(p0);
+
+    ASSERT_EQUAL_QUIET(false, b);
+}
+DECLARE_GENERIC_UNITTEST(TestDevicePointerBoolConversion);
+#endif
+
diff --git a/testing/device_reference.cu b/testing/device_reference.cu
index 3ba7ba527..c30934d75 100644
--- a/testing/device_reference.cu
+++ b/testing/device_reference.cu
@@ -206,3 +206,26 @@ void TestDeviceReferenceManipulation(void)
 }
 DECLARE_UNITTEST(TestDeviceReferenceManipulation);
 
+void TestDeviceReferenceSwap(void)
+{
+  typedef int T;
+
+  thrust::device_vector<T> v(2);
+  thrust::device_reference<T> ref1 = v.front();
+  thrust::device_reference<T> ref2 = v.back();
+
+  ref1 = 7;
+  ref2 = 13;
+
+  // test thrust::swap()
+  thrust::swap(ref1, ref2);
+  ASSERT_EQUAL(13, ref1);
+  ASSERT_EQUAL(7, ref2);
+
+  // test .swap()
+  ref1.swap(ref2);
+  ASSERT_EQUAL(7, ref1);
+  ASSERT_EQUAL(13, ref2);
+}
+DECLARE_UNITTEST(TestDeviceReferenceSwap);
+
diff --git a/testing/distance.cu b/testing/distance.cu
index 6e179e496..93e8abbf0 100644
--- a/testing/distance.cu
+++ b/testing/distance.cu
@@ -6,7 +6,6 @@
 template <typename Vector>
 void TestDistance(void)
 {
-    typedef typename Vector::value_type T;
     typedef typename Vector::iterator Iterator;
 
     Vector v(100);
diff --git a/testing/docs/doxybook_test.h b/testing/docs/doxybook_test.h
new file mode 100644
index 000000000..244648ee1
--- /dev/null
+++ b/testing/docs/doxybook_test.h
@@ -0,0 +1,222 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Test case for Doxybook rendering.
+ */
+
+#pragma once
+
+namespace thrust
+{
+
+/*! \addtogroup test Test
+ *  \{
+ */
+
+/*! \brief \c test_predefined_friend_struct is a class intended to exercise and
+ *  test Doxybook rendering.
+ */
+template <typename... Z>
+struct test_predefined_friend_struct {};
+
+/*! \brief \c test_predefined_friend_function is a function intended to
+ *  exercise and test Doxybook rendering.
+ */
+template <typename Z>
+void test_predefined_friend_function();
+
+/*! \brief \c test_class is a class intended to exercise and test Doxybook
+ *  rendering.
+ *
+ *  It does many things.
+ *
+ *  \tparam T A template parameter.
+ *  \tparam U Another template parameter.
+ *
+ *  \see test_function
+ */
+template <typename T, typename U>
+class test_class
+{
+public:
+  template <typename Z>
+  struct test_nested_struct {};
+
+  int test_member_variable = 0; ///< A test member variable.
+
+  [[deprecated]] static constexpr int test_member_constant = 42; ///< A test member constant.
+
+  template <typename X, typename Y>
+  using test_type_alias = test_class<X, Y>;
+
+  enum class test_enum_class {
+    A = 15, ///< An enumerator. It is equal to 15.
+    B,
+    C
+  };
+
+  /*! \brief Construct an empty test class.
+   */
+  test_class() = default;
+
+  /*! \brief Construct a test class.
+   */
+  __host__ __device__ constexpr
+  test_class(int);
+
+  /*! \brief \c test_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__ constexpr
+  int test_member_function() = 0;
+
+  /*! \brief \c test_virtual_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__
+  virtual int test_virtual_member_function() = 0;
+
+  /*! \brief \c test_parameter_overflow_member_function is a function intended
+   *  to test Doxybook's rendering of function and template parameters that exceed
+   *  the length of a line.
+   */
+  template <typename A = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+            typename B = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+            typename C = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>>
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int>
+  test_parameter_overflow_member_function(test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> a,
+                                          test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> b,
+                                          test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int> c);
+
+  template <typename Z>
+  friend void test_friend_function() {}
+
+  template <typename Z>
+  friend void test_predefined_friend_function();
+
+  template <typename... Z>
+  friend struct thrust::test_predefined_friend_struct;
+
+protected:
+
+  template <typename Z>
+  class test_protected_nested_class {};
+
+  /*! \brief \c test_protected_member_function is a function intended to
+   *  exercise and test Doxybook rendering.
+   */
+  __device__
+  auto test_protected_member_function();
+};
+
+/*! \brief \c test_derived_class is a derived class intended to exercise and
+ *  test Doxybook rendering.
+ */
+class test_derived_class : test_class<int, double>
+{
+  template <typename Z>
+  struct test_derived_nested_struct {};
+
+  double test_derived_member_variable = 3.14; ///< A test member variable.
+
+  typedef double test_typedef;
+
+  /*! \brief \c test_derived_member_function is a function intended to exercise
+   *  and test Doxybook rendering.
+   */
+  __host__ __device__ constexpr
+  double test_derived_member_function(int, int);
+};
+
+/*! \brief \c test_function is a function intended to exercise and test Doxybook
+ *  rendering.
+ *
+ *  \tparam T A template parameter.
+ *
+ *  \param a A function parameter.
+ *  \param b A function parameter.
+ */
+template <typename T>
+void test_function(T const& a, test_class<T, T const>&& b);
+
+/*! \brief \c test_parameter_overflow_function is a function intended to test
+ *  Doxybook's rendering of function and template parameters that exceed the
+ *  length of a line.
+ */
+template <typename T = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+  typename U = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>,
+  typename V = test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int>
+>
+test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int, int, int, int>
+test_parameter_overflow_function(test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> t,
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> u,
+  test_predefined_friend_struct<int, int, int, int, int, int, int, int, int, int, int, int> v);
+
+/*! \brief \c test_enum is an enum namespace intended to exercise and test
+ *  Doxybook rendering.
+ */
+enum class test_enum {
+  X = 1, ///< An enumerator. It is equal to 1.
+  Y = X,
+  Z = 2
+};
+
+/*! \brief \c test_alias is a type alias intended to exercise and test Doxybook
+ * rendering.
+ */
+using test_alias = test_class<int, double>;
+
+/*! \brief \c test_namespace is a namespace intended to exercise and test
+ *  Doxybook rendering.
+ */
+namespace test_namespace {
+
+inline constexpr int test_constant = 12;
+
+/*! \brief \c nested_function is a function intended to exercise and test
+ *  Doxybook rendering.
+ */
+template <typename T, typename U>
+auto test_nested_function(T t, U u) noexcept(noexcept(t + u)) -> decltype(t + u)
+{ return t + u; }
+
+/*! \brief \c test_struct is a struct intended to exercise and test Doxybook
+ *  rendering.
+ */
+template <typename Z>
+struct test_struct
+{
+  test_struct& operator=(test_struct const&) = default;
+
+  /*! \brief \c operator< is a function intended to exercise and test Doxybook
+   *  rendering.
+   */
+  bool operator<(test_struct const& t);
+};
+
+} // namespace test_namespace
+
+/*! \brief \c THRUST_TEST_MACRO is a macro intended to exercise and test
+ *  Doxybook rendering.
+ */
+#define THRUST_TEST_MACRO(x, y) thrust::test_namespace::nested_function(x, y)
+
+/*! \} // test
+ */
+
+} // namespace thrust
+
diff --git a/testing/equal.cu b/testing/equal.cu
index 744fa5373..ca9f7eb69 100644
--- a/testing/equal.cu
+++ b/testing/equal.cu
@@ -2,6 +2,8 @@
 #include <thrust/equal.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 template <class Vector>
 void TestEqualSimple(void)
@@ -62,7 +64,7 @@ void TestEqual(const size_t n)
 DECLARE_VARIABLE_UNITTEST(TestEqual);
 
 template<typename InputIterator1, typename InputIterator2>
-bool equal(my_system &system, InputIterator1 first, InputIterator1, InputIterator2)
+bool equal(my_system &system, InputIterator1 /*first*/, InputIterator1, InputIterator2)
 {
     system.validate_dispatch();
     return false;
@@ -102,3 +104,48 @@ void TestEqualDispatchImplicit()
 }
 DECLARE_UNITTEST(TestEqualDispatchImplicit);
 
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    bool operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestEqualWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
+
+    ASSERT_EQUAL(thrust::equal(thrust::device, begin, end, begin, fn), true);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestEqualWithBigIndexes()
+{
+    TestEqualWithBigIndexesHelper(30);
+    TestEqualWithBigIndexesHelper(31);
+    TestEqualWithBigIndexesHelper(32);
+    TestEqualWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestEqualWithBigIndexes);
diff --git a/testing/event.cu b/testing/event.cu
new file mode 100644
index 000000000..581426919
--- /dev/null
+++ b/testing/event.cu
@@ -0,0 +1,180 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/event.h>
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_default_constructed()
+{
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::event<decltype(thrust::device)>
+    , thrust::unique_eager_event<decltype(thrust::device)>
+    >::value)
+  );
+
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::event<decltype(thrust::device)>
+    , thrust::device_event
+    >::value)
+  );
+
+  THRUST_STATIC_ASSERT(
+    (std::is_same<
+      thrust::device_event
+    , thrust::device_unique_eager_event
+    >::value)
+  );
+
+  thrust::device_event e0;
+
+  ASSERT_EQUAL(false, e0.valid_stream());
+
+  ASSERT_THROWS_EQUAL(
+    e0.wait()
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_state)
+  );
+
+  ASSERT_THROWS_EQUAL(
+    e0.stream()
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_state)
+  );
+}
+DECLARE_UNITTEST(test_event_default_constructed);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_new_stream()
+{
+  auto e0 = thrust::device_event(thrust::new_stream);
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0.stream().native_handle());    
+
+  e0.wait();
+
+  ASSERT_EQUAL(true, e0.ready());
+}
+DECLARE_UNITTEST(test_event_new_stream);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_linear_chaining()
+{
+  constexpr std::int64_t n = 1024;
+
+  // Create a new stream.
+  auto e0 = thrust::when_all();
+
+  auto const e0_stream = e0.stream().native_handle();
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0_stream);
+
+  thrust::device_event e1;
+
+  for (std::int64_t i = 0; i < n; ++i)
+  {
+    ASSERT_EQUAL(true,  e0.valid_stream());
+
+    ASSERT_EQUAL(false, e1.valid_stream());
+    ASSERT_EQUAL(false, e1.ready());
+
+    ASSERT_EQUAL_QUIET(e0_stream, e0.stream().native_handle());
+
+    e1 = thrust::when_all(e0);
+
+    ASSERT_EQUAL(false, e0.valid_stream());
+    ASSERT_EQUAL(false, e0.ready());
+
+    ASSERT_EQUAL(true,  e1.valid_stream());
+
+    ASSERT_EQUAL(e0_stream, e1.stream().native_handle());
+
+    std::swap(e0, e1);
+  }
+}
+DECLARE_UNITTEST(test_event_linear_chaining);
+
+///////////////////////////////////////////////////////////////////////////////
+
+__host__
+void test_event_when_all()
+{
+  // Create events with new streams.
+  auto e0 = thrust::when_all();
+  auto e1 = thrust::when_all();
+  auto e2 = thrust::when_all();
+  auto e3 = thrust::when_all();
+  auto e4 = thrust::when_all();
+  auto e5 = thrust::when_all();
+  auto e6 = thrust::when_all();
+  auto e7 = thrust::when_all();
+
+  auto const e0_stream = e0.stream().native_handle();
+
+  ASSERT_EQUAL(true, e0.valid_stream());
+  ASSERT_EQUAL(true, e1.valid_stream());
+  ASSERT_EQUAL(true, e2.valid_stream());
+  ASSERT_EQUAL(true, e3.valid_stream());
+  ASSERT_EQUAL(true, e4.valid_stream());
+  ASSERT_EQUAL(true, e5.valid_stream());
+  ASSERT_EQUAL(true, e6.valid_stream());
+  ASSERT_EQUAL(true, e7.valid_stream());
+
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e0_stream);
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e1.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e2.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e3.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e4.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e5.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e6.stream().native_handle());
+  ASSERT_NOT_EQUAL_QUIET(nullptr, e7.stream().native_handle());
+
+  auto e8 = thrust::when_all(e0, e1, e2, e3, e4, e5, e6, e7);
+
+  ASSERT_EQUAL(false, e0.valid_stream());
+  ASSERT_EQUAL(false, e1.valid_stream());
+  ASSERT_EQUAL(false, e2.valid_stream());
+  ASSERT_EQUAL(false, e3.valid_stream());
+  ASSERT_EQUAL(false, e4.valid_stream());
+  ASSERT_EQUAL(false, e5.valid_stream());
+  ASSERT_EQUAL(false, e6.valid_stream());
+  ASSERT_EQUAL(false, e7.valid_stream());
+
+  ASSERT_EQUAL(true, e8.valid_stream());
+
+  ASSERT_EQUAL(e0_stream, e8.stream().native_handle());
+
+  e8.wait();
+
+  ASSERT_EQUAL(false, e0.ready());
+  ASSERT_EQUAL(false, e1.ready());
+  ASSERT_EQUAL(false, e2.ready());
+  ASSERT_EQUAL(false, e3.ready());
+  ASSERT_EQUAL(false, e4.ready());
+  ASSERT_EQUAL(false, e5.ready());
+  ASSERT_EQUAL(false, e6.ready());
+  ASSERT_EQUAL(false, e7.ready());
+
+  ASSERT_EQUAL(true,  e8.ready());
+}
+DECLARE_UNITTEST(test_event_when_all);
+
+///////////////////////////////////////////////////////////////////////////////
+ 
+#endif
+
diff --git a/testing/fill.cu b/testing/fill.cu
index 6cb8a8a38..7154b4118 100644
--- a/testing/fill.cu
+++ b/testing/fill.cu
@@ -5,7 +5,7 @@
 #include <thrust/iterator/retag.h>
 #include <algorithm>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 template <class Vector>
 void TestFillSimple(void)
@@ -22,17 +22,17 @@ void TestFillSimple(void)
     ASSERT_EQUAL(v[2], 7);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
-    
+
     thrust::fill(v.begin() + 0, v.begin() + 3, (T) 8);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 8);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
-    
+
     thrust::fill(v.begin() + 2, v.end(), (T) 9);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 9);
@@ -40,7 +40,7 @@ void TestFillSimple(void)
     ASSERT_EQUAL(v[4], 9);
 
     thrust::fill(v.begin(), v.end(), (T) 1);
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
@@ -67,19 +67,17 @@ DECLARE_UNITTEST(TestFillDiscardIterator);
 template <class Vector>
 void TestFillMixedTypes(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(4);
 
-    thrust::fill(v.begin(), v.end(), (long) 10);
-    
-    ASSERT_EQUAL(v[0], 10);
-    ASSERT_EQUAL(v[1], 10);
-    ASSERT_EQUAL(v[2], 10);
-    ASSERT_EQUAL(v[3], 10);
-    
-    thrust::fill(v.begin(), v.end(), (float) 20);
-    
+    thrust::fill(v.begin(), v.end(), bool(true));
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
+
+    thrust::fill(v.begin(), v.end(), char(20));
+
     ASSERT_EQUAL(v[0], 20);
     ASSERT_EQUAL(v[1], 20);
     ASSERT_EQUAL(v[2], 20);
@@ -103,17 +101,17 @@ void TestFill(size_t n)
     thrust::fill(d_data.begin() + std::min((size_t)117, n), d_data.begin() + std::min((size_t)367, n), (T) 1);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin() + std::min((size_t)8, n), h_data.begin() + std::min((size_t)259, n), (T) 2);
     thrust::fill(d_data.begin() + std::min((size_t)8, n), d_data.begin() + std::min((size_t)259, n), (T) 2);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin() + std::min((size_t)3, n), h_data.end(), (T) 3);
     thrust::fill(d_data.begin() + std::min((size_t)3, n), d_data.end(), (T) 3);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill(h_data.begin(), h_data.end(), (T) 4);
     thrust::fill(d_data.begin(), d_data.end(), (T) 4);
 
@@ -137,18 +135,18 @@ void TestFillNSimple(void)
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
     ASSERT_EQUAL_QUIET(v.begin() + 4, iter);
-    
+
     iter = thrust::fill_n(v.begin() + 0, 3, (T) 8);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 8);
     ASSERT_EQUAL(v[3], 7);
     ASSERT_EQUAL(v[4], 4);
     ASSERT_EQUAL_QUIET(v.begin() + 3, iter);
-    
+
     iter = thrust::fill_n(v.begin() + 2, 3, (T) 9);
-    
+
     ASSERT_EQUAL(v[0], 8);
     ASSERT_EQUAL(v[1], 8);
     ASSERT_EQUAL(v[2], 9);
@@ -157,7 +155,7 @@ void TestFillNSimple(void)
     ASSERT_EQUAL_QUIET(v.end(), iter);
 
     iter = thrust::fill_n(v.begin(), v.size(), (T) 1);
-    
+
     ASSERT_EQUAL(v[0], 1);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 1);
@@ -191,20 +189,18 @@ DECLARE_UNITTEST(TestFillNDiscardIterator);
 template <class Vector>
 void TestFillNMixedTypes(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(4);
 
-    typename Vector::iterator iter = thrust::fill_n(v.begin(), v.size(), (long) 10);
-    
-    ASSERT_EQUAL(v[0], 10);
-    ASSERT_EQUAL(v[1], 10);
-    ASSERT_EQUAL(v[2], 10);
-    ASSERT_EQUAL(v[3], 10);
+    typename Vector::iterator iter = thrust::fill_n(v.begin(), v.size(), bool(true));
+
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 1);
+    ASSERT_EQUAL(v[3], 1);
     ASSERT_EQUAL_QUIET(v.end(), iter);
-    
-    iter = thrust::fill_n(v.begin(), v.size(), (float) 20);
-    
+
+    iter = thrust::fill_n(v.begin(), v.size(), char(20));
+
     ASSERT_EQUAL(v[0], 20);
     ASSERT_EQUAL(v[1], 20);
     ASSERT_EQUAL(v[2], 20);
@@ -231,19 +227,19 @@ void TestFillN(size_t n)
     thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)367, n) - begin_offset, (T) 1);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     begin_offset = std::min<size_t>(8, n);
     thrust::fill_n(h_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
     thrust::fill_n(d_data.begin() + begin_offset, std::min((size_t)259, n) - begin_offset, (T) 2);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     begin_offset = std::min<size_t>(3, n);
     thrust::fill_n(h_data.begin() + begin_offset, h_data.size() - begin_offset, (T) 3);
     thrust::fill_n(d_data.begin() + begin_offset, d_data.size() - begin_offset, (T) 3);
 
     ASSERT_EQUAL(h_data, d_data);
-    
+
     thrust::fill_n(h_data.begin(), h_data.size(), (T) 4);
     thrust::fill_n(d_data.begin(), d_data.size(), (T) 4);
 
@@ -305,7 +301,7 @@ void TestFillWithTrivialAssignment(void)
 
     thrust::host_vector<T>   h(1);
     thrust::device_vector<T> d(1);
-    
+
     ASSERT_EQUAL(h[0].x, 0);
     ASSERT_EQUAL(h[0].y, 0);
     ASSERT_EQUAL(h[0].z, 0);
@@ -338,6 +334,10 @@ struct TypeWithNonTrivialAssigment
   __host__ __device__
   TypeWithNonTrivialAssigment() : x(0), y(0), z(0) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  TypeWithNonTrivialAssigment(const TypeWithNonTrivialAssigment &) = default;
+#endif
+
   __host__ __device__
   TypeWithNonTrivialAssigment& operator=(const TypeWithNonTrivialAssigment& t)
   {
@@ -346,7 +346,7 @@ struct TypeWithNonTrivialAssigment
     z = t.x + t.y;
     return *this;
   }
-  
+
   __host__ __device__
   bool operator==(const TypeWithNonTrivialAssigment& t) const
   {
@@ -360,7 +360,7 @@ void TestFillWithNonTrivialAssignment(void)
 
     thrust::host_vector<T>   h(1);
     thrust::device_vector<T> d(1);
-    
+
     ASSERT_EQUAL(h[0].x, 0);
     ASSERT_EQUAL(h[0].y, 0);
     ASSERT_EQUAL(h[0].z, 0);
@@ -387,7 +387,7 @@ DECLARE_UNITTEST(TestFillWithNonTrivialAssignment);
 
 
 template<typename ForwardIterator, typename T>
-void fill(my_system &system, ForwardIterator first, ForwardIterator, const T&)
+void fill(my_system &system, ForwardIterator /*first*/, ForwardIterator, const T&)
 {
     system.validate_dispatch();
 }
@@ -462,4 +462,4 @@ void TestFillNDispatchImplicit()
 DECLARE_UNITTEST(TestFillNDispatchImplicit);
 
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/find.cu b/testing/find.cu
index 898997851..988afbeef 100644
--- a/testing/find.cu
+++ b/testing/find.cu
@@ -1,4 +1,5 @@
 #include <unittest/unittest.h>
+#include <thrust/sequence.h>
 #include <thrust/find.h>
 #include <thrust/iterator/retag.h>
 
@@ -39,8 +40,6 @@ struct less_than_value_pred
 template <class Vector>
 void TestFindSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector vec(5);
     vec[0] = 1;
     vec[1] = 2;
@@ -306,3 +305,69 @@ struct TestFindIfNot
 };
 VariableUnitTest<TestFindIfNot, SignedIntegralTypes> TestFindIfNotInstance;
 
+void TestFindWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::detail::intmax_t distance_low_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            17));
+
+    thrust::detail::intmax_t distance_high_value = thrust::distance(
+        begin,
+        thrust::find(
+            thrust::device,
+            begin,
+            end,
+            (1ll << magnitude) - 17));
+
+    ASSERT_EQUAL(distance_low_value, 16);
+    ASSERT_EQUAL(distance_high_value, (1ll << magnitude) - 18);
+}
+
+void TestFindWithBigIndexes()
+{
+    TestFindWithBigIndexesHelper(30);
+    TestFindWithBigIndexesHelper(31);
+    TestFindWithBigIndexesHelper(32);
+    TestFindWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestFindWithBigIndexes);
+
+namespace
+{
+
+class Weird
+{
+  int value;
+
+public:
+  __host__ __device__ Weird(int val, int)
+      : value(val)
+  {}
+
+  friend __host__ __device__
+  bool operator==(int x, Weird y)
+  {
+    return x == y.value;
+  }
+};
+
+} // end anon namespace
+
+void TestFindAsymmetricEquality()
+{ // Regression test for NVIDIA/thrust#1229
+  thrust::host_vector<int> v(1000);
+  thrust::sequence(v.begin(), v.end());
+  thrust::device_vector<int> dv(v);
+  auto result = thrust::find(dv.begin(), dv.end(), Weird(333, 0));
+  ASSERT_EQUAL(*result, 333);
+  ASSERT_EQUAL(result - dv.begin(), 333);
+}
+DECLARE_UNITTEST(TestFindAsymmetricEquality);
diff --git a/testing/fix_clang_nvcc_11.5.h b/testing/fix_clang_nvcc_11.5.h
new file mode 100644
index 000000000..279dca3f9
--- /dev/null
+++ b/testing/fix_clang_nvcc_11.5.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#if defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 &&                       \
+    __CUDACC_VER_MINOR__ <= 5
+
+#if defined(__NVCC_DIAG_PRAGMA_SUPPORT__)
+#  pragma nv_diag_suppress 3171
+#else
+#  pragma diag_suppress 3171
+#endif
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wkeyword-compat"
+
+// Clang has a builtin called `__is_signed`. Unfortunately, libstdc++ headers
+// use this name as an identifier. Clang has a workaround for that, it checks 
+// if `__is_signed` is `const static bool` as in libstdc++ headers and if so,
+// disables the intrinsic for the rest of the TU:
+// https://github.com/llvm/llvm-project/blob/f49b6afc231242dfee027d5da69734836097cd43/clang/lib/Parse/ParseDecl.cpp#L3552-L3566
+const static bool __is_signed = false;
+
+#pragma clang diagnostic pop
+#endif // defined(__NVCC__) && defined(__clang__) && __CUDACC_VER_MAJOR__ == 11 &&
+       //   __CUDACC_VER_MINOR__ <= 5
diff --git a/testing/for_each.cu b/testing/for_each.cu
index 133b33a6f..8040e5f78 100644
--- a/testing/for_each.cu
+++ b/testing/for_each.cu
@@ -3,9 +3,11 @@
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 #include <algorithm>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 template <typename T>
 class mark_present_for_each
@@ -22,7 +24,7 @@ void TestForEachSimple(void)
 
     Vector input(5);
     Vector output(7, (T) 0);
-    
+
     input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;
 
     mark_present_for_each<T> f;
@@ -39,7 +41,7 @@ void TestForEachSimple(void)
     ASSERT_EQUAL(output[6], 1);
     ASSERT_EQUAL_QUIET(result, input.end());
 }
-DECLARE_VECTOR_UNITTEST(TestForEachSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestForEachSimple);
 
 
 template<typename InputIterator, typename Function>
@@ -88,7 +90,7 @@ void TestForEachNSimple(void)
 
     Vector input(5);
     Vector output(7, (T) 0);
-    
+
     input[0] = 3; input[1] = 2; input[2] = 3; input[3] = 4; input[4] = 6;
 
     mark_present_for_each<T> f;
@@ -105,7 +107,7 @@ void TestForEachNSimple(void)
     ASSERT_EQUAL(output[6], 1);
     ASSERT_EQUAL_QUIET(result, input.end());
 }
-DECLARE_VECTOR_UNITTEST(TestForEachNSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestForEachNSimple);
 
 
 template<typename InputIterator, typename Size, typename Function>
@@ -304,7 +306,9 @@ void TestForEachWithLargeTypes(void)
     _TestForEachWithLargeTypes<int,  128>();
     _TestForEachWithLargeTypes<int,  256>();
     _TestForEachWithLargeTypes<int,  512>();
-    _TestForEachWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
+    
+    // XXX parallel_for doens't support large types 
+//    _TestForEachWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
 }
 DECLARE_UNITTEST(TestForEachWithLargeTypes);
 
@@ -343,8 +347,53 @@ void TestForEachNWithLargeTypes(void)
     _TestForEachNWithLargeTypes<int,  128>();
     _TestForEachNWithLargeTypes<int,  256>();
     _TestForEachNWithLargeTypes<int,  512>();
-    _TestForEachNWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
+
+    // XXX parallel_for doens't support large types 
+//    _TestForEachNWithLargeTypes<int, 1024>();  // fails on Vista 64 w/ VS2008
 }
 DECLARE_UNITTEST(TestForEachNWithLargeTypes);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+
+struct only_set_when_expected
+{
+    unsigned long long expected;
+    bool * flag;
+
+    __device__
+    void operator()(unsigned long long x)
+    {
+        if (x == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+void TestForEachWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<unsigned long long> begin(0);
+    thrust::counting_iterator<unsigned long long> end = begin + (1ull << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected fn = { (1ull << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::for_each(thrust::device, begin, end, fn);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestForEachWithBigIndexes()
+{
+    TestForEachWithBigIndexesHelper(30);
+    TestForEachWithBigIndexesHelper(31);
+    TestForEachWithBigIndexesHelper(32);
+    TestForEachWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestForEachWithBigIndexes);
diff --git a/testing/functional.cu b/testing/functional.cu
index c44b0a6f9..1d1a79b6c 100644
--- a/testing/functional.cu
+++ b/testing/functional.cu
@@ -5,7 +5,7 @@
 #include <functional>
 #include <algorithm>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
     
 const size_t NUM_SAMPLES = 10000;
 
@@ -294,7 +294,20 @@ void TestNot1(void)
     ASSERT_EQUAL(output[3], 0);
     ASSERT_EQUAL(output[4], 1);
 }
-DECLARE_VECTOR_UNITTEST(TestNot1);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestNot1);
+
+
+// GCC 11 fails to build this test case with a spurious error in a
+// very specific scenario:
+// - GCC 11
+// - CPP system for both host and device
+// - C++11 dialect
+#if !(defined(THRUST_GCC_VERSION) &&				\
+      THRUST_GCC_VERSION >= 110000 &&				\
+      THRUST_GCC_VERSION < 120000 &&				\
+      THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP &&		\
+      THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP &&	\
+      THRUST_CPP_DIALECT == 2011)
 
 template <class Vector>
 void TestNot2(void)
@@ -321,4 +334,6 @@ void TestNot2(void)
 }
 DECLARE_VECTOR_UNITTEST(TestNot2);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+#endif // Weird GCC11 failure case
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/functional_placeholders_arithmetic.cu b/testing/functional_placeholders_arithmetic.cu
index 50266f379..8d8535aa6 100644
--- a/testing/functional_placeholders_arithmetic.cu
+++ b/testing/functional_placeholders_arithmetic.cu
@@ -7,7 +7,7 @@
 template<typename Vector> \
   struct TestFunctionalPlaceholders##name \
 { \
-  void operator()(const size_t dummy) \
+  void operator()(const size_t) \
   { \
     static const size_t num_samples = 10000; \
     const size_t zero = 0; \
@@ -33,7 +33,7 @@ template<typename Vector> \
     ASSERT_ALMOST_EQUAL(reference, result); \
   } \
 }; \
-VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_malloc_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
 VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
 
 BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(Plus,       +, thrust::plus,       ThirtyTwoBitTypes);
@@ -65,8 +65,8 @@ template<typename T>
   struct unary_plus_reference
 {
   __host__ __device__ T operator()(const T &x) const
-  {
-    return +x;
+  { // Static cast to undo integral promotion
+    return static_cast<T>(+x);
   }
 };
 
diff --git a/testing/functional_placeholders_bitwise.cu b/testing/functional_placeholders_bitwise.cu
index 4942ebdab..7c92d967f 100644
--- a/testing/functional_placeholders_bitwise.cu
+++ b/testing/functional_placeholders_bitwise.cu
@@ -3,34 +3,45 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/constant_iterator.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::host_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U> type;
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::device_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
 {
-  typedef thrust::device_vector<U> type;
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, op, reference_functor, type_list) \
 template<typename Vector> \
   struct TestFunctionalPlaceholders##name \
 { \
-  void operator()(const size_t dummy) \
+  void operator()(const size_t) \
   { \
-    static const size_t num_samples = 10000; \
-    const size_t zero = 0; \
+    constexpr size_t NUM_SAMPLES = 10000; \
+    constexpr size_t ZERO = 0; \
     typedef typename Vector::value_type T; \
-    typedef typename rebind_vector<Vector,bool>::type bool_vector; \
-    Vector lhs = unittest::random_samples<T>(num_samples); \
-    Vector rhs = unittest::random_samples<T>(num_samples); \
+    Vector lhs = unittest::random_samples<T>(NUM_SAMPLES); \
+    Vector rhs = unittest::random_samples<T>(NUM_SAMPLES); \
     thrust::replace(rhs.begin(), rhs.end(), T(0), T(1)); \
 \
     Vector reference(lhs.size()); \
@@ -45,12 +56,12 @@ template<typename Vector> \
     thrust::transform(lhs.begin(), lhs.end(), result.begin(), _1 op T(1)); \
     ASSERT_ALMOST_EQUAL(reference, result); \
 \
-    thrust::transform(thrust::make_constant_iterator<T>(1,zero), thrust::make_constant_iterator<T>(1,num_samples), rhs.begin(), reference.begin(), reference_functor<T>()); \
+    thrust::transform(thrust::make_constant_iterator<T>(1,ZERO), thrust::make_constant_iterator<T>(1,NUM_SAMPLES), rhs.begin(), reference.begin(), reference_functor<T>()); \
     thrust::transform(rhs.begin(), rhs.end(), result.begin(), T(1) op _1); \
     ASSERT_ALMOST_EQUAL(reference, result); \
   } \
 }; \
-VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_malloc_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
 VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
 
 BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(BitAnd, &, thrust::bit_and, SmallIntegralTypes);
@@ -82,5 +93,5 @@ template<typename Vector>
 
   ASSERT_EQUAL(reference, result);
 }
-DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholdersBitNegate);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersBitNegate);
 
diff --git a/testing/functional_placeholders_compound_assignment.cu b/testing/functional_placeholders_compound_assignment.cu
index b6893673d..512fa73fa 100644
--- a/testing/functional_placeholders_compound_assignment.cu
+++ b/testing/functional_placeholders_compound_assignment.cu
@@ -7,7 +7,7 @@
 template<typename Vector> \
   struct TestFunctionalPlaceholders##name \
 { \
-  void operator()(const size_t dummy) \
+  void operator()(const size_t) \
   { \
     const size_t num_samples = 10000; \
     typedef typename Vector::value_type T; \
@@ -31,7 +31,7 @@ template<typename Vector> \
     ASSERT_ALMOST_EQUAL(lhs_reference, lhs); \
   } \
 }; \
-VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_malloc_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
+VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholders##name##DeviceInstance; \
 VectorUnitTest<TestFunctionalPlaceholders##name, type_list, thrust::host_vector, std::allocator> TestFunctionalPlaceholders##name##HostInstance;
 
 template<typename T>
@@ -161,7 +161,7 @@ template<typename Vector> \
   ASSERT_ALMOST_EQUAL(input_reference, input); \
   ASSERT_ALMOST_EQUAL(reference, result); \
 } \
-DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholdersPrefix##name);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersPrefix##name);
 
 PREFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Increment,  ++,  prefix_increment_reference);
 PREFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Decrement,  --,  prefix_decrement_reference);
@@ -185,7 +185,7 @@ template<typename Vector> \
   ASSERT_ALMOST_EQUAL(input_reference, input); \
   ASSERT_ALMOST_EQUAL(reference, result); \
 } \
-DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholdersSuffix##name);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersSuffix##name);
 
 SUFFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Increment,  ++,  suffix_increment_reference);
 SUFFIX_FUNCTIONAL_PLACEHOLDERS_TEST(Decrement,  --,  suffix_decrement_reference);
diff --git a/testing/functional_placeholders_logical.cu b/testing/functional_placeholders_logical.cu
index b6d04574e..caca82040 100644
--- a/testing/functional_placeholders_logical.cu
+++ b/testing/functional_placeholders_logical.cu
@@ -2,20 +2,32 @@
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::host_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+  typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+  typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+  typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U> type;
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::device_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
 {
-  typedef thrust::device_vector<U> type;
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
@@ -63,5 +75,5 @@ template<typename Vector>
 
   ASSERT_EQUAL(reference, result);
 }
-DECLARE_VECTOR_UNITTEST(TestFunctionalPlaceholdersLogicalNot);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestFunctionalPlaceholdersLogicalNot);
 
diff --git a/testing/functional_placeholders_miscellaneous.cu b/testing/functional_placeholders_miscellaneous.cu
index 5650a615f..d6774211b 100644
--- a/testing/functional_placeholders_miscellaneous.cu
+++ b/testing/functional_placeholders_miscellaneous.cu
@@ -20,7 +20,7 @@ template<typename T>
 template<typename Vector>
   struct TestFunctionalPlaceholdersValue
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     const size_t n = 10000;
     typedef typename Vector::value_type T;
@@ -39,13 +39,13 @@ template<typename Vector>
     ASSERT_ALMOST_EQUAL(reference, result);
   }
 };
-VectorUnitTest<TestFunctionalPlaceholdersValue, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_malloc_allocator> TestFunctionalPlaceholdersValueDevice;
+VectorUnitTest<TestFunctionalPlaceholdersValue, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholdersValueDevice;
 VectorUnitTest<TestFunctionalPlaceholdersValue, ThirtyTwoBitTypes, thrust::host_vector, std::allocator> TestFunctionalPlaceholdersValueHost;
 
 template<typename Vector>
   struct TestFunctionalPlaceholdersTransformIterator
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     const size_t n = 10000;
     typedef typename Vector::value_type T;
@@ -68,6 +68,6 @@ template<typename Vector>
     ASSERT_ALMOST_EQUAL(reference, result);
   }
 };
-VectorUnitTest<TestFunctionalPlaceholdersTransformIterator, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_malloc_allocator> TestFunctionalPlaceholdersTransformIteratorInstanceDevice;
+VectorUnitTest<TestFunctionalPlaceholdersTransformIterator, ThirtyTwoBitTypes, thrust::device_vector, thrust::device_allocator> TestFunctionalPlaceholdersTransformIteratorInstanceDevice;
 VectorUnitTest<TestFunctionalPlaceholdersTransformIterator, ThirtyTwoBitTypes, thrust::host_vector, std::allocator> TestFunctionalPlaceholdersTransformIteratorInstanceHost;
 
diff --git a/testing/functional_placeholders_relational.cu b/testing/functional_placeholders_relational.cu
index 5b3a794b3..7f088a1ea 100644
--- a/testing/functional_placeholders_relational.cu
+++ b/testing/functional_placeholders_relational.cu
@@ -2,20 +2,32 @@
 #include <thrust/functional.h>
 #include <thrust/transform.h>
 
+#include <thrust/detail/allocator/allocator_traits.h>
+
 static const size_t num_samples = 10000;
 
 template<typename Vector, typename U> struct rebind_vector;
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::host_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::host_vector<T, Allocator>, U>
+{
+    typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
+    typedef typename alloc_traits::template rebind_alloc<U> new_alloc;
+    typedef thrust::host_vector<U, new_alloc> type;
+};
+
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::device_vector<T, Allocator>, U>
 {
-  typedef thrust::host_vector<U> type;
+  typedef thrust::device_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
-template<typename T, typename U>
-  struct rebind_vector<thrust::device_vector<T>, U>
+template<typename T, typename U, typename Allocator>
+  struct rebind_vector<thrust::universal_vector<T, Allocator>, U>
 {
-  typedef thrust::device_vector<U> type;
+  typedef thrust::universal_vector<U,
+    typename Allocator::template rebind<U>::other> type;
 };
 
 #define BINARY_FUNCTIONAL_PLACEHOLDERS_TEST(name, reference_operator, functor) \
diff --git a/testing/future.cu b/testing/future.cu
new file mode 100644
index 000000000..eb1ab582a
--- /dev/null
+++ b/testing/future.cu
@@ -0,0 +1,255 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+#include <unittest/util_async.h>
+
+#include <thrust/future.h>
+
+struct mock {};
+
+using future_value_types = unittest::type_list<
+  char
+, signed char
+, unsigned char
+, short
+, unsigned short
+, int
+, unsigned int
+, long
+, unsigned long
+, long long
+, unsigned long long
+, float
+, double
+, custom_numeric
+, float2
+, mock
+>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_default_constructed
+{
+  __host__
+  void operator()()
+  {
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::future<decltype(thrust::device), T>
+      , thrust::unique_eager_future<decltype(thrust::device), T>
+      >::value)
+    );
+
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::future<decltype(thrust::device), T>
+      , thrust::device_future<T>
+      >::value)
+    );
+
+    THRUST_STATIC_ASSERT(
+      (std::is_same<
+        thrust::device_future<T>
+      , thrust::device_unique_eager_future<T>
+      >::value)
+    );
+
+    thrust::device_future<T> f0;
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_THROWS_EQUAL(
+      f0.wait()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      f0.stream()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_state)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      f0.get()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      THRUST_UNUSED_VAR(f0.extract())
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_default_constructed
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_new_stream
+{
+  __host__
+  void operator()()
+  {
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+
+    ASSERT_EQUAL(true,  f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0.stream().native_handle());    
+
+    TEST_EVENT_WAIT(f0);
+
+    ASSERT_EQUAL(true, f0.ready());
+
+    ASSERT_THROWS_EQUAL(
+      f0.get()
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+
+    ASSERT_THROWS_EQUAL(
+      THRUST_UNUSED_VAR(f0.extract())
+    , thrust::event_error
+    , thrust::event_error(thrust::event_errc::no_content)
+    );
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_new_stream
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_convert_to_event
+{
+  __host__
+  void operator()()
+  {
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+
+    auto const f0_stream = f0.stream().native_handle();
+
+    ASSERT_EQUAL(true,  f0.valid_stream());
+    ASSERT_EQUAL(false, f0.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0_stream);
+
+    auto f1 = thrust::device_event(std::move(f0));
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(true,  f1.valid_stream());
+
+    ASSERT_EQUAL(f0_stream, f1.stream().native_handle());
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_convert_to_event
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct test_future_when_all
+{
+  __host__
+  void operator()()
+  {
+    // Create futures with new streams.
+    auto f0 = thrust::device_future<T>(thrust::new_stream);
+    auto f1 = thrust::device_future<T>(thrust::new_stream);
+    auto f2 = thrust::device_future<T>(thrust::new_stream);
+    auto f3 = thrust::device_future<T>(thrust::new_stream);
+    auto f4 = thrust::device_future<T>(thrust::new_stream);
+    auto f5 = thrust::device_future<T>(thrust::new_stream);
+    auto f6 = thrust::device_future<T>(thrust::new_stream);
+    auto f7 = thrust::device_future<T>(thrust::new_stream);
+
+    auto const f0_stream = f0.stream().native_handle();
+
+    ASSERT_EQUAL(true, f0.valid_stream());
+    ASSERT_EQUAL(true, f1.valid_stream());
+    ASSERT_EQUAL(true, f2.valid_stream());
+    ASSERT_EQUAL(true, f3.valid_stream());
+    ASSERT_EQUAL(true, f4.valid_stream());
+    ASSERT_EQUAL(true, f5.valid_stream());
+    ASSERT_EQUAL(true, f6.valid_stream());
+    ASSERT_EQUAL(true, f7.valid_stream());
+
+    ASSERT_EQUAL(false, f0.valid_content());
+    ASSERT_EQUAL(false, f1.valid_content());
+    ASSERT_EQUAL(false, f2.valid_content());
+    ASSERT_EQUAL(false, f3.valid_content());
+    ASSERT_EQUAL(false, f4.valid_content());
+    ASSERT_EQUAL(false, f5.valid_content());
+    ASSERT_EQUAL(false, f6.valid_content());
+    ASSERT_EQUAL(false, f7.valid_content());
+
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f0_stream);
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f1.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f2.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f3.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f4.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f5.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f6.stream().native_handle());
+    ASSERT_NOT_EQUAL_QUIET(nullptr, f7.stream().native_handle());
+
+    auto e0 = thrust::when_all(f0, f1, f2, f3, f4, f5, f6, f7);
+
+    ASSERT_EQUAL(false, f0.valid_stream());
+    ASSERT_EQUAL(false, f1.valid_stream());
+    ASSERT_EQUAL(false, f2.valid_stream());
+    ASSERT_EQUAL(false, f3.valid_stream());
+    ASSERT_EQUAL(false, f4.valid_stream());
+    ASSERT_EQUAL(false, f5.valid_stream());
+    ASSERT_EQUAL(false, f6.valid_stream());
+    ASSERT_EQUAL(false, f7.valid_stream());
+
+    ASSERT_EQUAL(false, f0.valid_content());
+    ASSERT_EQUAL(false, f1.valid_content());
+    ASSERT_EQUAL(false, f2.valid_content());
+    ASSERT_EQUAL(false, f3.valid_content());
+    ASSERT_EQUAL(false, f4.valid_content());
+    ASSERT_EQUAL(false, f5.valid_content());
+    ASSERT_EQUAL(false, f6.valid_content());
+    ASSERT_EQUAL(false, f7.valid_content());
+
+    ASSERT_EQUAL(true,  e0.valid_stream());
+
+    ASSERT_EQUAL(f0_stream, e0.stream().native_handle());
+
+    TEST_EVENT_WAIT(e0);
+
+    ASSERT_EQUAL(false, f0.ready());
+    ASSERT_EQUAL(false, f1.ready());
+    ASSERT_EQUAL(false, f2.ready());
+    ASSERT_EQUAL(false, f3.ready());
+    ASSERT_EQUAL(false, f4.ready());
+    ASSERT_EQUAL(false, f5.ready());
+    ASSERT_EQUAL(false, f6.ready());
+    ASSERT_EQUAL(false, f7.ready());
+
+    ASSERT_EQUAL(true,  e0.ready());
+  }
+};
+DECLARE_GENERIC_UNITTEST_WITH_TYPES(
+  test_future_when_all
+, future_value_types
+);
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif
+
diff --git a/testing/gather.cu b/testing/gather.cu
index 1fd70e427..c164e44b2 100644
--- a/testing/gather.cu
+++ b/testing/gather.cu
@@ -7,14 +7,12 @@
 #include <thrust/sequence.h>
 #include <algorithm>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 
 template <class Vector>
 void TestGatherSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector map(5);  // gather indices
     Vector src(8);  // source vector
     Vector dst(5);  // destination vector
@@ -31,7 +29,7 @@ void TestGatherSimple(void)
     ASSERT_EQUAL(dst[3], 7);
     ASSERT_EQUAL(dst[4], 2);
 }
-DECLARE_VECTOR_UNITTEST(TestGatherSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherSimple);
 
 
 template<typename InputIterator, typename RandomAccessIterator, typename OutputIterator>
@@ -141,8 +139,6 @@ DECLARE_VARIABLE_UNITTEST(TestGatherToDiscardIterator);
 template <class Vector>
 void TestGatherIfSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector flg(5);  // predicate array
     Vector map(5);  // gather indices
     Vector src(8);  // source vector
@@ -161,7 +157,7 @@ void TestGatherIfSimple(void)
     ASSERT_EQUAL(dst[3], 7);
     ASSERT_EQUAL(dst[4], 0);
 }
-DECLARE_VECTOR_UNITTEST(TestGatherIfSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherIfSimple);
 
 template <typename T>
 struct is_even_gather_if
@@ -178,10 +174,10 @@ template<typename InputIterator1,
          typename RandomAccessIterator,
          typename OutputIterator>
 OutputIterator gather_if(my_system &system,
-                         InputIterator1       map_first,
-                         InputIterator1       map_last,
-                         InputIterator2       stencil,
-                         RandomAccessIterator input_first,
+                         InputIterator1, //       map_first,
+                         InputIterator1, //       map_last,
+                         InputIterator2, //       stencil,
+                         RandomAccessIterator, // input_first,
                          OutputIterator       result)
 {
     system.validate_dispatch();
@@ -210,10 +206,10 @@ template<typename InputIterator1,
          typename RandomAccessIterator,
          typename OutputIterator>
 OutputIterator gather_if(my_tag,
-                         InputIterator1       map_first,
-                         InputIterator1       map_last,
-                         InputIterator2       stencil,
-                         RandomAccessIterator input_first,
+                         InputIterator1, //       map_first,
+                         InputIterator1, //       map_last,
+                         InputIterator2, //       stencil,
+                         RandomAccessIterator, // input_first,
                          OutputIterator       result)
 {
     *result = 13;
@@ -315,8 +311,6 @@ DECLARE_VARIABLE_UNITTEST(TestGatherIfToDiscardIterator);
 template <typename Vector>
 void TestGatherCountingIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector source(10);
     thrust::sequence(source.begin(), source.end(), 0);
 
@@ -352,6 +346,6 @@ void TestGatherCountingIterator(void)
 
     ASSERT_EQUAL(output, map);
 }
-DECLARE_VECTOR_UNITTEST(TestGatherCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestGatherCountingIterator);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/generate.cu b/testing/generate.cu
index 762c39558..fefd7d8e6 100644
--- a/testing/generate.cu
+++ b/testing/generate.cu
@@ -3,7 +3,7 @@
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/iterator/retag.h>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
 template<typename T>
 struct return_value
@@ -40,7 +40,7 @@ DECLARE_VECTOR_UNITTEST(TestGenerateSimple);
 
 
 template<typename ForwardIterator, typename Generator>
-void generate(my_system &system, ForwardIterator first, ForwardIterator, Generator)
+void generate(my_system &system, ForwardIterator /*first*/, ForwardIterator, Generator)
 {
     system.validate_dispatch();
 }
@@ -92,7 +92,7 @@ void TestGenerate(const size_t n)
 DECLARE_VARIABLE_UNITTEST(TestGenerate);
 
 template <typename T>
-void TestGenerateToDiscardIterator(const size_t n)
+void TestGenerateToDiscardIterator(const size_t)
 {
     T value = 13;
     return_value<T> f(value);
@@ -224,4 +224,4 @@ void TestGenerateTuple(void)
 };
 DECLARE_UNITTEST(TestGenerateTuple);
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
diff --git a/testing/generate_const_iterators.cu b/testing/generate_const_iterators.cu
new file mode 100644
index 000000000..fd12bfb3b
--- /dev/null
+++ b/testing/generate_const_iterators.cu
@@ -0,0 +1,29 @@
+#include <unittest/runtime_static_assert.h>
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+
+struct generator
+{
+    __host__ __device__
+    int operator()() const
+    {
+        return 1;
+    }
+};
+
+void TestGenerateConstIteratorCompilationError()
+{
+    thrust::host_vector<int> test1(10);
+
+    ASSERT_STATIC_ASSERT(thrust::generate(test1.cbegin(), test1.cend(), generator()));
+    ASSERT_STATIC_ASSERT(thrust::generate_n(test1.cbegin(), 10, generator()));
+}
+DECLARE_UNITTEST(TestGenerateConstIteratorCompilationError);
+
+void TestFillConstIteratorCompilationError()
+{
+    thrust::host_vector<int> test1(10);
+    ASSERT_STATIC_ASSERT(thrust::fill(test1.cbegin(), test1.cend(), 1));
+}
+DECLARE_UNITTEST(TestFillConstIteratorCompilationError);
+
diff --git a/testing/inner_product.cu b/testing/inner_product.cu
index c1f77904b..4fae72e88 100644
--- a/testing/inner_product.cu
+++ b/testing/inner_product.cu
@@ -1,6 +1,11 @@
 #include <unittest/unittest.h>
 #include <thrust/inner_product.h>
+
+#include <thrust/functional.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
+#include <thrust/device_vector.h>
 
 template <class Vector>
 void TestInnerProductSimple(void)
@@ -100,4 +105,69 @@ struct TestInnerProduct
 };
 VariableUnitTest<TestInnerProduct, IntegralTypes> TestInnerProductInstance;
 
+struct only_set_when_both_expected
+{
+    long long expected;
+    bool * flag;
+
+    __device__
+    long long operator()(long long x, long long y)
+    {
+        if (x == expected && y == expected)
+        {
+            *flag = true;
+        }
+
+        return x == y;
+    }
+};
+
+void TestInnerProductWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_both_expected fn = { (1ll << magnitude) - 1,
+        thrust::raw_pointer_cast(has_executed) };
 
+    ASSERT_EQUAL(thrust::inner_product(
+        thrust::device,
+        begin, end,
+        begin,
+        0ll,
+        thrust::plus<long long>(),
+        fn), (1ll << magnitude));
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInnerProductWithBigIndexes()
+{
+    TestInnerProductWithBigIndexesHelper(30);
+    TestInnerProductWithBigIndexesHelper(31);
+    TestInnerProductWithBigIndexesHelper(32);
+    TestInnerProductWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestInnerProductWithBigIndexes);
+
+void TestInnerProductPlaceholders()
+{ // Regression test for NVIDIA/thrust#1178
+  using namespace thrust::placeholders;
+
+  thrust::device_vector<float> v1(100, 1.f);
+  thrust::device_vector<float> v2(100, 1.f);
+
+  auto result = thrust::inner_product(v1.begin(), v1.end(), v2.begin(), 0.0f,
+                                      thrust::plus<float>{},
+                                      _1 * _2 + 1.0f);
+
+  ASSERT_ALMOST_EQUAL(result, 200.f);
+}
+DECLARE_UNITTEST(TestInnerProductPlaceholders);
diff --git a/testing/is_contiguous_iterator.cu b/testing/is_contiguous_iterator.cu
new file mode 100644
index 000000000..42a5aa663
--- /dev/null
+++ b/testing/is_contiguous_iterator.cu
@@ -0,0 +1,228 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/static_assert.h>
+#include <iterator>
+#include <vector>
+#if THRUST_CPP_DIALECT >= 2011
+  #include <array>
+  #include <unordered_map>
+  #include <unordered_set>
+#endif
+#include <string>
+#if THRUST_CPP_DIALECT >= 2017
+  #include <string_view>
+#endif
+#include <deque>
+#include <list>
+#include <map>
+#include <set>
+#include <thrust/device_ptr.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_vector.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::string::iterator
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::wstring::iterator
+>::value));
+
+#if THRUST_CPP_DIALECT >= 2017
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::string_view::iterator
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+  std::wstring_view::iterator
+>::value));
+#endif
+
+THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+  std::vector<bool>::iterator
+>::value));
+
+template <typename T>
+__host__
+void test_is_contiguous_iterator()
+{
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    T*
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    T const*
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    thrust::device_ptr<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename std::vector<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::vector<T>::reverse_iterator
+  >::value));
+
+  #if THRUST_CPP_DIALECT >= 2011
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename std::array<T, 1>::iterator
+  >::value));
+  #endif
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::list<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::deque<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::set<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::multiset<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::map<T, T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::multimap<T, T>::iterator
+  >::value));
+
+  #if THRUST_CPP_DIALECT >= 2011
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_set<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_multiset<T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_map<T, T>::iterator
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    typename std::unordered_multimap<T, T>::iterator
+  >::value));
+  #endif
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    std::istream_iterator<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_contiguous_iterator<
+    std::ostream_iterator<T>
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_contiguous_iterator);
+
+template <typename Vector>
+__host__
+void test_is_contiguous_iterator_vectors()
+{
+  THRUST_STATIC_ASSERT((thrust::is_contiguous_iterator<
+    typename Vector::iterator
+  >::value));
+}
+DECLARE_VECTOR_UNITTEST(test_is_contiguous_iterator_vectors);
+
+
+struct expect_pointer{};
+struct expect_passthrough{};
+
+template <typename IteratorT,
+          typename PointerT,
+          typename expected_unwrapped_type /* = expect_[pointer|passthrough] */>
+struct check_unwrapped_iterator
+{
+  using unwrapped_t = typename std::remove_reference<
+    decltype(thrust::detail::try_unwrap_contiguous_iterator(
+      std::declval<IteratorT>()))>::type;
+
+  static constexpr bool value =
+    std::is_same<expected_unwrapped_type, expect_pointer>::value
+      ? std::is_same<unwrapped_t, PointerT>::value
+      : std::is_same<unwrapped_t, IteratorT>::value;
+};
+
+template <typename T>
+void test_try_unwrap_contiguous_iterator()
+{
+  // Raw pointers should pass whether expecting pointers or passthrough.
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T *,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T *,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T const *,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<T const *,
+                                                 T const *,
+                                                 expect_passthrough>::value));
+
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<thrust::device_ptr<T>,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<thrust::device_ptr<T const>,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::vector<T>::iterator,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::vector<T>::reverse_iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::array<T, 1>::iterator,
+                                                 T *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::array<T const, 1>::iterator,
+                                                 T const *,
+                                                 expect_pointer>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::list<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::deque<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::set<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::multiset<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::map<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::multimap<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_set<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_multiset<T>::iterator,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_map<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<typename std::unordered_multimap<T, T>::iterator,
+                                                 std::pair<T const, T> *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<std::istream_iterator<T>,
+                                                 T *,
+                                                 expect_passthrough>::value));
+  THRUST_STATIC_ASSERT((check_unwrapped_iterator<std::ostream_iterator<T>,
+                                                 void,
+                                                 expect_passthrough>::value));
+}
+DECLARE_GENERIC_UNITTEST(test_try_unwrap_contiguous_iterator);
diff --git a/testing/is_operator_function_object.cu b/testing/is_operator_function_object.cu
new file mode 100644
index 000000000..935ee1e55
--- /dev/null
+++ b/testing/is_operator_function_object.cu
@@ -0,0 +1,195 @@
+#include <unittest/unittest.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/type_traits/is_operator_less_or_greater_function_object.h>
+#include <thrust/type_traits/is_operator_plus_function_object.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+  std::less<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+  std::greater<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+  std::less<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+  std::greater<>
+>::value));
+
+THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+  std::plus<>
+>::value));
+#endif
+
+template <typename T>
+__host__
+void test_is_operator_less_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_less_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_greater_function_object()
+{
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_greater_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_greater_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_greater_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_less_or_greater_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    thrust::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    thrust::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_less_or_greater_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    std::less_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    std::greater_equal<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_less_or_greater_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_less_or_greater_function_object);
+
+template <typename T>
+__host__
+void test_is_operator_plus_function_object()
+{
+  THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+    thrust::plus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::minus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    thrust::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((thrust::is_operator_plus_function_object<
+    std::plus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::minus<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::less<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    std::greater<T>
+  >::value));
+
+  THRUST_STATIC_ASSERT((!thrust::is_operator_plus_function_object<
+    T
+  >::value));
+}
+DECLARE_GENERIC_UNITTEST(test_is_operator_plus_function_object);
+
diff --git a/testing/is_partitioned.cu b/testing/is_partitioned.cu
index 0bdd10128..e503f32a3 100644
--- a/testing/is_partitioned.cu
+++ b/testing/is_partitioned.cu
@@ -14,7 +14,6 @@ template<typename Vector>
 void TestIsPartitionedSimple(void)
 {
   typedef typename Vector::value_type T;
-  typedef typename Vector::iterator Iterator;
 
   Vector v(4);
   v[0] = 1; v[1] = 1; v[2] = 1; v[3] = 0;
@@ -59,11 +58,11 @@ void TestIsPartitioned(void)
 
   ASSERT_EQUAL(true, thrust::is_partitioned(v.begin(), v.end(), is_even<T>()));
 }
-DECLARE_VECTOR_UNITTEST(TestIsPartitioned);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsPartitioned);
 
 
 template<typename InputIterator, typename Predicate>
-bool is_partitioned(my_system &system, InputIterator first, InputIterator, Predicate)
+bool is_partitioned(my_system &system, InputIterator /*first*/, InputIterator, Predicate)
 {
   system.validate_dispatch();
   return false;
diff --git a/testing/is_sorted.cu b/testing/is_sorted.cu
index 001becd7b..9edb7ed22 100644
--- a/testing/is_sorted.cu
+++ b/testing/is_sorted.cu
@@ -72,11 +72,11 @@ void TestIsSorted(void)
 
     ASSERT_EQUAL(thrust::is_sorted(v.begin(), v.end()), true);
 }
-DECLARE_VECTOR_UNITTEST(TestIsSorted);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsSorted);
 
 
 template<typename InputIterator>
-bool is_sorted(my_system &system, InputIterator first, InputIterator)
+bool is_sorted(my_system &system, InputIterator /*first*/, InputIterator)
 {
   system.validate_dispatch();
   return false;
diff --git a/testing/is_sorted_until.cu b/testing/is_sorted_until.cu
index 9e1b50917..128395581 100644
--- a/testing/is_sorted_until.cu
+++ b/testing/is_sorted_until.cu
@@ -94,7 +94,7 @@ void TestIsSortedUntil(void)
 
     ASSERT_EQUAL_QUIET(v.end(), thrust::is_sorted_until(v.begin(), v.end()));
 }
-DECLARE_VECTOR_UNITTEST(TestIsSortedUntil);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestIsSortedUntil);
 
 
 template<typename ForwardIterator>
diff --git a/testing/logical.cu b/testing/logical.cu
index 9faf28710..0a2b6edc9 100644
--- a/testing/logical.cu
+++ b/testing/logical.cu
@@ -26,7 +26,7 @@ DECLARE_VECTOR_UNITTEST(TestAllOf);
 
 
 template <class InputIterator, class Predicate>
-bool all_of(my_system &system, InputIterator first, InputIterator last, Predicate pred)
+bool all_of(my_system &system, InputIterator, InputIterator, Predicate)
 {
     system.validate_dispatch();
     return false;
@@ -45,7 +45,7 @@ DECLARE_UNITTEST(TestAllOfDispatchExplicit);
 
 
 template <class InputIterator, class Predicate>
-bool all_of(my_tag, InputIterator first, InputIterator last, Predicate pred)
+bool all_of(my_tag, InputIterator first, InputIterator, Predicate)
 {
     *first = 13;
     return false;
@@ -86,7 +86,7 @@ DECLARE_VECTOR_UNITTEST(TestAnyOf);
 
 
 template <class InputIterator, class Predicate>
-bool any_of(my_system &system, InputIterator first, InputIterator last, Predicate pred)
+bool any_of(my_system &system, InputIterator, InputIterator, Predicate)
 {
     system.validate_dispatch();
     return false;
@@ -105,7 +105,7 @@ DECLARE_UNITTEST(TestAnyOfDispatchExplicit);
 
 
 template <class InputIterator, class Predicate>
-bool any_of(my_tag, InputIterator first, InputIterator last, Predicate pred)
+bool any_of(my_tag, InputIterator first, InputIterator, Predicate)
 {
     *first = 13;
     return false;
@@ -146,7 +146,7 @@ DECLARE_VECTOR_UNITTEST(TestNoneOf);
 
 
 template <class InputIterator, class Predicate>
-bool none_of(my_system &system, InputIterator first, InputIterator last, Predicate pred)
+bool none_of(my_system &system, InputIterator, InputIterator, Predicate)
 {
     system.validate_dispatch();
     return false;
@@ -165,7 +165,7 @@ DECLARE_UNITTEST(TestNoneOfDispatchExplicit);
 
 
 template <class InputIterator, class Predicate>
-bool none_of(my_tag, InputIterator first, InputIterator last, Predicate pred)
+bool none_of(my_tag, InputIterator first, InputIterator, Predicate)
 {
     *first = 13;
     return false;
diff --git a/testing/max_element.cu b/testing/max_element.cu
index 965f6067f..456239264 100644
--- a/testing/max_element.cu
+++ b/testing/max_element.cu
@@ -1,6 +1,8 @@
 #include <unittest/unittest.h>
 #include <thrust/extrema.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/functional.h>
 
 template <class Vector>
 void TestMaxElementSimple(void)
@@ -23,6 +25,30 @@ void TestMaxElementSimple(void)
 }
 DECLARE_VECTOR_UNITTEST(TestMaxElementSimple);
 
+template <class Vector>
+void TestMaxElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::max_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())), -1);
+    ASSERT_EQUAL( *thrust::max_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>()),
+          thrust::greater<T>()), -5);
+    
+}
+DECLARE_VECTOR_UNITTEST(TestMaxElementWithTransform);
+
 template<typename T>
 void TestMaxElement(const size_t n)
 {
@@ -79,3 +105,20 @@ void TestMaxElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMaxElementDispatchImplicit);
 
+void TestMaxElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(*thrust::max_element(thrust::device, begin, end), (1ll << magnitude));
+}
+
+void TestMaxElementWithBigIndexes()
+{
+    TestMaxElementWithBigIndexesHelper(30);
+    TestMaxElementWithBigIndexesHelper(31);
+    TestMaxElementWithBigIndexesHelper(32);
+    TestMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMaxElementWithBigIndexes);
diff --git a/testing/memory.cu b/testing/memory.cu
index 6dadf5f9d..e4c1da8f6 100644
--- a/testing/memory.cu
+++ b/testing/memory.cu
@@ -46,6 +46,68 @@ class my_memory_system : public thrust::device_execution_policy<my_memory_system
     my_memory_system();
 };
 
+namespace my_old_namespace
+{
+
+struct my_old_temporary_allocation_system
+  : public thrust::device_execution_policy<my_old_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_old_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_old_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_old_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(4217));
+
+  return thrust::make_pair(result, 314);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_old_temporary_allocation_system, Pointer p)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(4217));
+}
+
+} // my_old_namespace
+
+namespace my_new_namespace
+{
+
+struct my_new_temporary_allocation_system
+  : public thrust::device_execution_policy<my_new_temporary_allocation_system>
+{
+};
+
+template <typename T>
+thrust::pair<thrust::pointer<T, my_new_temporary_allocation_system>, std::ptrdiff_t>
+get_temporary_buffer(my_new_temporary_allocation_system, std::ptrdiff_t)
+{
+  thrust::pointer<T, my_new_temporary_allocation_system> const
+    result(reinterpret_cast<T*>(1742));
+
+  return thrust::make_pair(result, 413);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer)
+{
+  // This should never be called (the three-argument with size overload below
+  // should be preferred) and shouldn't be ambiguous.
+  ASSERT_EQUAL(true, false);
+}
+
+template<typename Pointer>
+void return_temporary_buffer(my_new_temporary_allocation_system, Pointer p, std::ptrdiff_t n)
+{
+  typedef typename thrust::detail::pointer_traits<Pointer>::raw_pointer RP;
+  ASSERT_EQUAL(p.get(), reinterpret_cast<RP>(1742));
+  ASSERT_EQUAL(n, 413);
+}
+
+} // my_new_namespace
 
 template<typename T1, typename T2>
 bool are_same(const T1 &, const T2 &)
@@ -104,7 +166,7 @@ DECLARE_UNITTEST(TestSelectSystemSameTypes);
 
 void TestGetTemporaryBuffer()
 {
-  const size_t n = 9001;
+  const std::ptrdiff_t n = 9001;
 
   thrust::device_system_tag dev_tag;
   typedef thrust::pointer<int, thrust::device_system_tag> pointer;
@@ -119,14 +181,14 @@ void TestGetTemporaryBuffer()
 
   ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-  thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first);
+  thrust::return_temporary_buffer(dev_tag, ptr_and_sz.first, ptr_and_sz.second);
 }
 DECLARE_UNITTEST(TestGetTemporaryBuffer);
 
 
 void TestMalloc()
 {
-  const size_t n = 9001;
+  const std::ptrdiff_t n = 9001;
 
   thrust::device_system_tag dev_tag;
   typedef thrust::pointer<int, thrust::device_system_tag> pointer;
@@ -198,12 +260,7 @@ template<typename T>
 
 void TestGetTemporaryBufferDispatchExplicit()
 {
-#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
-  // gcc 4.2 does not do adl correctly for get_temporary_buffer
-  // gcc 4.3 does not do adl correctly for malloc
-  KNOWN_FAILURE;
-#else
-  const size_t n = 9001;
+  const std::ptrdiff_t n = 9001;
 
   my_memory_system sys(0);
   typedef thrust::pointer<int, thrust::device_system_tag> pointer;
@@ -219,8 +276,7 @@ void TestGetTemporaryBufferDispatchExplicit()
 
   ASSERT_EQUAL(true, thrust::all_of(ptr_and_sz.first, ptr_and_sz.first + n, thrust::placeholders::_1 == ref_val));
 
-  thrust::return_temporary_buffer(sys, ptr_and_sz.first);
-#endif
+  thrust::return_temporary_buffer(sys, ptr_and_sz.first, ptr_and_sz.second);
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDispatchExplicit);
 
@@ -234,11 +290,6 @@ void TestGetTemporaryBufferDispatchImplicit()
   }
   else
   {
-#if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION < 40400)
-    // gcc 4.2 does not do adl correctly for get_temporary_buffer
-    // gcc 4.3 does not do adl correctly for malloc
-    KNOWN_FAILURE;
-#else
     thrust::device_vector<int> vec(9001);
 
     thrust::sequence(vec.begin(), vec.end());
@@ -250,8 +301,48 @@ void TestGetTemporaryBufferDispatchImplicit()
 
     ASSERT_EQUAL(true, thrust::is_sorted(vec.begin(), vec.end()));
     ASSERT_EQUAL(true, sys.is_valid());
-#endif
   }
 }
 DECLARE_UNITTEST(TestGetTemporaryBufferDispatchImplicit);
 
+
+void TestTemporaryBufferOldCustomization()
+{
+  typedef my_old_namespace::my_old_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_old_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(4217));
+    ASSERT_EQUAL(ps.second, 314);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferOldCustomization);
+
+
+void TestTemporaryBufferNewCustomization()
+{
+  typedef my_new_namespace::my_new_temporary_allocation_system system;
+  typedef thrust::pointer<int, system> pointer;
+  typedef thrust::pair<pointer, std::ptrdiff_t> pointer_and_size;
+
+  system sys;
+
+  {
+    pointer_and_size ps = thrust::get_temporary_buffer<int>(sys, 0);
+
+    // The magic values are defined in `my_new_namespace` above.
+    ASSERT_EQUAL(ps.first.get(), reinterpret_cast<int*>(1742));
+    ASSERT_EQUAL(ps.second, 413);
+
+    thrust::return_temporary_buffer(sys, ps.first, ps.second);
+  }
+}
+DECLARE_UNITTEST(TestTemporaryBufferNewCustomization);
diff --git a/testing/metaprogamming.cu b/testing/metaprogamming.cu
index 53a7d8994..32f0a2e20 100644
--- a/testing/metaprogamming.cu
+++ b/testing/metaprogamming.cu
@@ -5,24 +5,24 @@ void TestLog2(void)
 {
     unsigned int result;
     
-    result = thrust::detail::mpl::math::log2<  1>::value;   ASSERT_EQUAL(result, 0);
-    result = thrust::detail::mpl::math::log2<  2>::value;   ASSERT_EQUAL(result, 1);
-    result = thrust::detail::mpl::math::log2<  3>::value;   ASSERT_EQUAL(result, 1);
-    result = thrust::detail::mpl::math::log2<  4>::value;   ASSERT_EQUAL(result, 2);
-    result = thrust::detail::mpl::math::log2<  5>::value;   ASSERT_EQUAL(result, 2);
-    result = thrust::detail::mpl::math::log2<  6>::value;   ASSERT_EQUAL(result, 2);
-    result = thrust::detail::mpl::math::log2<  7>::value;   ASSERT_EQUAL(result, 2);
-    result = thrust::detail::mpl::math::log2<  8>::value;   ASSERT_EQUAL(result, 3);
-    result = thrust::detail::mpl::math::log2<  9>::value;   ASSERT_EQUAL(result, 3);
-    result = thrust::detail::mpl::math::log2< 15>::value;   ASSERT_EQUAL(result, 3);
-    result = thrust::detail::mpl::math::log2< 16>::value;   ASSERT_EQUAL(result, 4);
-    result = thrust::detail::mpl::math::log2< 17>::value;   ASSERT_EQUAL(result, 4);
-    result = thrust::detail::mpl::math::log2<127>::value;   ASSERT_EQUAL(result, 6);
-    result = thrust::detail::mpl::math::log2<128>::value;   ASSERT_EQUAL(result, 7);
-    result = thrust::detail::mpl::math::log2<129>::value;   ASSERT_EQUAL(result, 7);
-    result = thrust::detail::mpl::math::log2<256>::value;   ASSERT_EQUAL(result, 8);
-    result = thrust::detail::mpl::math::log2<511>::value;   ASSERT_EQUAL(result, 8);
-    result = thrust::detail::mpl::math::log2<512>::value;   ASSERT_EQUAL(result, 9);
+    result = thrust::detail::mpl::math::log2<  1>::value;   ASSERT_EQUAL(result, 0lu);
+    result = thrust::detail::mpl::math::log2<  2>::value;   ASSERT_EQUAL(result, 1lu);
+    result = thrust::detail::mpl::math::log2<  3>::value;   ASSERT_EQUAL(result, 1lu);
+    result = thrust::detail::mpl::math::log2<  4>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  5>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  6>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  7>::value;   ASSERT_EQUAL(result, 2lu);
+    result = thrust::detail::mpl::math::log2<  8>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2<  9>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2< 15>::value;   ASSERT_EQUAL(result, 3lu);
+    result = thrust::detail::mpl::math::log2< 16>::value;   ASSERT_EQUAL(result, 4lu);
+    result = thrust::detail::mpl::math::log2< 17>::value;   ASSERT_EQUAL(result, 4lu);
+    result = thrust::detail::mpl::math::log2<127>::value;   ASSERT_EQUAL(result, 6lu);
+    result = thrust::detail::mpl::math::log2<128>::value;   ASSERT_EQUAL(result, 7lu);
+    result = thrust::detail::mpl::math::log2<129>::value;   ASSERT_EQUAL(result, 7lu);
+    result = thrust::detail::mpl::math::log2<256>::value;   ASSERT_EQUAL(result, 8lu);
+    result = thrust::detail::mpl::math::log2<511>::value;   ASSERT_EQUAL(result, 8lu);
+    result = thrust::detail::mpl::math::log2<512>::value;   ASSERT_EQUAL(result, 9lu);
 }
 DECLARE_UNITTEST(TestLog2);
 
diff --git a/testing/min_element.cu b/testing/min_element.cu
index 21bd4ebf2..81fedbdab 100644
--- a/testing/min_element.cu
+++ b/testing/min_element.cu
@@ -23,6 +23,30 @@ void TestMinElementSimple(void)
 }
 DECLARE_VECTOR_UNITTEST(TestMinElementSimple);
 
+template <class Vector>
+void TestMinElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::min_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())), -5);
+    ASSERT_EQUAL( *thrust::min_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>()),
+          thrust::greater<T>()), -1);
+    
+}
+DECLARE_VECTOR_UNITTEST(TestMinElementWithTransform);
+
 template<typename T>
 void TestMinElement(const size_t n)
 {
@@ -79,3 +103,22 @@ void TestMinElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMinElementDispatchImplicit);
 
+void TestMinElementWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(1);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    ASSERT_EQUAL(
+        *thrust::min_element(thrust::device, begin, end, thrust::greater<long long>()),
+        (1ll << magnitude));
+}
+
+void TestMinElementWithBigIndexes()
+{
+    TestMinElementWithBigIndexesHelper(30);
+    TestMinElementWithBigIndexesHelper(31);
+    TestMinElementWithBigIndexesHelper(32);
+    TestMinElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinElementWithBigIndexes);
diff --git a/testing/minmax_element.cu b/testing/minmax_element.cu
index 2aae8d24f..4a87f5bb4 100644
--- a/testing/minmax_element.cu
+++ b/testing/minmax_element.cu
@@ -5,8 +5,6 @@
 template <class Vector>
 void TestMinMaxElementSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data(6);
     data[0] = 3;
     data[1] = 5;
@@ -21,6 +19,29 @@ void TestMinMaxElementSimple(void)
     ASSERT_EQUAL(  thrust::minmax_element(data.begin(), data.end()).second - data.begin(), 1);
 }
 DECLARE_VECTOR_UNITTEST(TestMinMaxElementSimple);
+  
+template <class Vector>
+void TestMinMaxElementWithTransform(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(6);
+    data[0] = 3;
+    data[1] = 5;
+    data[2] = 1;
+    data[3] = 2;
+    data[4] = 5;
+    data[5] = 1;
+
+    ASSERT_EQUAL( *thrust::minmax_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())).first, -5);
+    ASSERT_EQUAL( *thrust::minmax_element(
+          thrust::make_transform_iterator(data.begin(), thrust::negate<T>()),
+          thrust::make_transform_iterator(data.end(),   thrust::negate<T>())).second, -1);
+}
+DECLARE_VECTOR_UNITTEST(TestMinMaxElementWithTransform);
+
 
 template<typename T>
 void TestMinMaxElement(const size_t n)
@@ -89,3 +110,29 @@ void TestMinMaxElementDispatchImplicit()
 }
 DECLARE_UNITTEST(TestMinMaxElementDispatchImplicit);
 
+void TestMinMaxElementWithBigIndexesHelper(int magnitude)
+{
+    typedef thrust::counting_iterator<long long> Iter;
+    Iter begin(1);
+    Iter end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::pair<Iter, Iter> result = thrust::minmax_element(
+        thrust::device, begin, end);
+    ASSERT_EQUAL(*result.first, 1);
+    ASSERT_EQUAL(*result.second, (1ll << magnitude));
+
+    result = thrust::minmax_element(thrust::device, begin, end,
+        thrust::greater<long long>());
+    ASSERT_EQUAL(*result.second, 1);
+    ASSERT_EQUAL(*result.first, (1ll << magnitude));
+}
+
+void TestMinMaxElementWithBigIndexes()
+{
+    TestMinMaxElementWithBigIndexesHelper(30);
+    TestMinMaxElementWithBigIndexesHelper(31);
+    TestMinMaxElementWithBigIndexesHelper(32);
+    TestMinMaxElementWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestMinMaxElementWithBigIndexes);
diff --git a/testing/mismatch.cu b/testing/mismatch.cu
index 679a70dc3..9c2ce351a 100644
--- a/testing/mismatch.cu
+++ b/testing/mismatch.cu
@@ -5,8 +5,6 @@
 template <class Vector>
 void TestMismatchSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector a(4); Vector b(4);
     a[0] = 1; b[0] = 1;
     a[1] = 2; b[1] = 2;
diff --git a/testing/mr_disjoint_pool.cu b/testing/mr_disjoint_pool.cu
new file mode 100644
index 000000000..69a6005ec
--- /dev/null
+++ b/testing/mr_disjoint_pool.cu
@@ -0,0 +1,301 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/disjoint_pool.h>
+#include <thrust/mr/new.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/mr/disjoint_sync_pool.h>
+#endif
+
+struct alloc_id
+{
+    std::size_t id;
+    std::size_t size;
+    std::size_t alignment;
+    std::size_t offset;
+
+    __host__ __device__
+    bool operator==(const alloc_id & other) const
+    {
+        return id == other.id && size == other.size && alignment == other.alignment;
+    }
+
+    alloc_id operator+(std::size_t size_) const
+    {
+        alloc_id ret;
+        ret.id = id;
+        ret.size = size_;
+        ret.alignment = alignment;
+        ret.offset = size_;
+        return ret;
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+namespace detail {
+template<>
+struct pointer_traits<alloc_id>
+{
+    template<typename>
+    struct rebind
+    {
+        typedef alloc_id other;
+    };
+
+    // implemented for the purposes of alignment test in disjoint pool's do_deallocate
+    static void * get(const alloc_id & id)
+    {
+        return reinterpret_cast<void *>(id.alignment);
+    }
+};
+
+} // end namespace detail
+
+THRUST_NAMESPACE_END
+
+class dummy_resource final : public thrust::mr::memory_resource<alloc_id>
+{
+public:
+    dummy_resource() : id_to_allocate(0), id_to_deallocate(0)
+    {
+    }
+
+    ~dummy_resource()
+    {
+        ASSERT_EQUAL(id_to_allocate, 0u);
+        ASSERT_EQUAL(id_to_deallocate, 0u);
+    }
+
+    virtual alloc_id do_allocate(std::size_t bytes, std::size_t alignment) override
+    {
+        ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
+
+        alloc_id ret;
+        ret.id = id_to_allocate;
+        ret.size = bytes;
+        ret.alignment = alignment;
+
+        id_to_allocate = 0;
+
+        return ret;
+    }
+
+    virtual void do_deallocate(alloc_id p, std::size_t bytes, std::size_t alignment) override
+    {
+        ASSERT_EQUAL(p.size, bytes);
+        ASSERT_EQUAL(p.alignment, alignment);
+
+        if (id_to_deallocate != 0)
+        {
+            ASSERT_EQUAL(p.id, id_to_deallocate);
+            id_to_deallocate = 0;
+        }
+    }
+
+    std::size_t id_to_allocate;
+    std::size_t id_to_deallocate;
+};
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointPool()
+{
+    dummy_resource upstream;
+    thrust::mr::new_delete_resource bookkeeper;
+
+    typedef PoolTemplate<
+        dummy_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = false;
+
+    // avoid having the destructor run when an assertion failure is raised
+    // (the destructor will try to release, which in turn calls do_deallocate,
+    // which may fail with an assertion failure exception...)
+    Pool * pool = new Pool(&upstream, &bookkeeper, opts);
+
+    upstream.id_to_allocate = 1;
+
+    // first allocation
+    alloc_id a1 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    // due to chunking, the above allocation should be enough for the next one too
+    alloc_id a2 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a2.id, 1u);
+
+    // deallocating and allocating back should give the same resource back
+    pool->do_deallocate(a1, 12, THRUST_MR_DEFAULT_ALIGNMENT);
+    alloc_id a3 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, a3.id);
+    ASSERT_EQUAL(a1.size, a3.size);
+    ASSERT_EQUAL(a1.alignment, a3.alignment);
+    ASSERT_EQUAL(a1.offset, a3.offset);
+
+    // allocating over-aligned memory should give non-cached results
+    upstream.id_to_allocate = 2;
+    alloc_id a4 = pool->do_allocate(32, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(a4.id, 2u);
+    ASSERT_EQUAL(a4.size, 32u);
+    ASSERT_EQUAL(a4.alignment, (std::size_t)THRUST_MR_DEFAULT_ALIGNMENT * 2);
+
+    // and deallocating it should return it back to upstream
+    upstream.id_to_deallocate = 2;
+    pool->do_deallocate(a4, 32u, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // release actually returns properly sized memory to upstream
+    upstream.id_to_deallocate = 1;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and does the same for oversized/overaligned memory
+    upstream.id_to_allocate = 3;
+    alloc_id a5 = pool->do_allocate(1024, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    upstream.id_to_deallocate = 3;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and after that, the formerly cached memory isn't used anymore,
+    // so new memory from upstream is returned back
+    upstream.id_to_allocate = 4;
+    alloc_id a6 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    // destruction also returns memory
+    upstream.id_to_deallocate = 4;
+
+    // actually destroy the pool; reasons why RAII is not used outlined at the beginning
+    // of this function
+    delete pool;
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+}
+
+void TestDisjointUnsynchronizedPool()
+{
+    TestDisjointPool<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointUnsynchronizedPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestDisjointSynchronizedPool()
+{
+    TestDisjointPool<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointSynchronizedPool);
+#endif
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointPoolCachingOversized()
+{
+    dummy_resource upstream;
+    thrust::mr::new_delete_resource bookkeeper;
+
+    typedef PoolTemplate<
+        dummy_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = true;
+    opts.largest_block_size = 1024;
+
+    Pool pool(&upstream, &bookkeeper, opts);
+
+    upstream.id_to_allocate = 1;
+    alloc_id a1 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    upstream.id_to_allocate = 2;
+    alloc_id a2 = pool.do_allocate(64, 32);
+    ASSERT_EQUAL(a2.id, 2u);
+
+    pool.do_deallocate(a2, 64, 32);
+    pool.do_deallocate(a1, 2048, 32);
+
+    // make sure a good fit is used from the cache
+    alloc_id a3 = pool.do_allocate(32, 32);
+    ASSERT_EQUAL(a3.id, 2u);
+
+    alloc_id a4 = pool.do_allocate(1024, 32);
+    ASSERT_EQUAL(a4.id, 1u);
+
+    pool.do_deallocate(a4, 1024, 32);
+
+    // make sure that a new block is allocated when there's nothing cached with
+    // the required alignment
+    upstream.id_to_allocate = 3;
+    alloc_id a5 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    pool.release();
+
+    // make sure that release actually clears caches
+    upstream.id_to_allocate = 4;
+    alloc_id a6 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    upstream.id_to_allocate = 5;
+    alloc_id a7 = pool.do_allocate(2048, 1024);
+    ASSERT_EQUAL(a7.id, 5u);
+
+    pool.do_deallocate(a7, 2048, 1024);
+
+    // make sure that the 'ridiculousness' factor for size (options.cached_size_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 6;
+    alloc_id a8 = pool.do_allocate(24, 1024);
+    ASSERT_EQUAL(a8.id, 6u);
+
+    // make sure that the 'ridiculousness' factor for alignment (options.cached_alignment_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 7;
+    alloc_id a9 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a9.id, 7u);
+}
+
+void TestDisjointUnsynchronizedPoolCachingOversized()
+{
+    TestDisjointPoolCachingOversized<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointUnsynchronizedPoolCachingOversized);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestDisjointSynchronizedPoolCachingOversized()
+{
+    TestDisjointPoolCachingOversized<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestDisjointSynchronizedPoolCachingOversized);
+#endif
+
+template<template<typename, typename> class PoolTemplate>
+void TestDisjointGlobalPool()
+{
+    typedef PoolTemplate<
+        thrust::mr::new_delete_resource,
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    ASSERT_EQUAL(thrust::mr::get_global_resource<Pool>() != NULL, true);
+}
+
+void TestUnsynchronizedDisjointGlobalPool()
+{
+    TestDisjointGlobalPool<thrust::mr::disjoint_unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedDisjointGlobalPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedDisjointGlobalPool()
+{
+    TestDisjointGlobalPool<thrust::mr::disjoint_synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedDisjointGlobalPool);
+#endif
+
diff --git a/testing/mr_new.cu b/testing/mr_new.cu
new file mode 100644
index 000000000..02f34eccf
--- /dev/null
+++ b/testing/mr_new.cu
@@ -0,0 +1,36 @@
+#include <unittest/unittest.h>
+#include <thrust/mr/new.h>
+#include <thrust/fill.h>
+
+template<typename MemoryResource>
+void TestAlignment(MemoryResource memres, std::size_t size, std::size_t alignment)
+{
+    void * ptr = memres.do_allocate(size, alignment);
+    ASSERT_EQUAL(reinterpret_cast<std::size_t>(ptr) % alignment, 0u);
+
+    char * char_ptr = reinterpret_cast<char *>(ptr);
+    thrust::fill(char_ptr, char_ptr + size, char{});
+
+    memres.do_deallocate(ptr, size, alignment);
+}
+
+static const std::size_t MinTestedSize = 32;
+static const std::size_t MaxTestedSize = 8 * 1024;
+static const std::size_t TestedSizeStep = 1;
+
+static const std::size_t MinTestedAlignment = 16;
+static const std::size_t MaxTestedAlignment = 4 * 1024;
+static const std::size_t TestedAlignmentShift = 1;
+
+void TestNewDeleteResourceAlignedAllocation()
+{
+    for (std::size_t size = MinTestedSize; size <= MaxTestedSize; size += TestedSizeStep)
+    {
+        for (std::size_t alignment = MinTestedAlignment; alignment <= MaxTestedAlignment;
+            alignment <<= TestedAlignmentShift)
+        {
+            TestAlignment(thrust::mr::new_delete_resource(), size, alignment);
+        }
+    }
+}
+DECLARE_UNITTEST(TestNewDeleteResourceAlignedAllocation);
diff --git a/testing/mr_pool.cu b/testing/mr_pool.cu
new file mode 100644
index 000000000..30c1f18a4
--- /dev/null
+++ b/testing/mr_pool.cu
@@ -0,0 +1,360 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/pool.h>
+#include <thrust/mr/new.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <thrust/mr/sync_pool.h>
+#endif
+
+template<typename T>
+struct reference
+{
+    typedef T & type;
+};
+
+template<>
+struct reference<void>
+{
+    typedef void type;
+};
+
+struct unit {};
+
+template<typename T>
+struct tracked_pointer : thrust::iterator_facade<
+                            tracked_pointer<T>,
+                            T,
+                            thrust::host_system_tag,
+                            thrust::random_access_traversal_tag,
+                            typename reference<T>::type,
+                            std::ptrdiff_t
+                         >
+{
+    typedef T * raw_pointer;
+
+    std::size_t id;
+    std::size_t size;
+    std::size_t alignment;
+    std::size_t offset;
+    void * ptr;
+
+    __host__ __device__
+    explicit tracked_pointer(T * ptr = NULL) : id(), size(), alignment(), offset(), ptr(ptr)
+    {
+    }
+
+    __host__ __device__
+    ~tracked_pointer()
+    {
+    }
+
+    template<typename U>
+    operator tracked_pointer<U>() const
+    {
+        tracked_pointer<U> ret;
+        ret.id = id;
+        ret.size = size;
+        ret.alignment = alignment;
+        ret.offset = offset;
+        ret.ptr = ptr;
+        return ret;
+    }
+
+    __host__ __device__
+    std::ptrdiff_t distance_to(const tracked_pointer & other) const
+    {
+        return static_cast<T *>(other.ptr) - static_cast<T *>(ptr);
+    }
+
+    __host__ __device__
+    T * get() const
+    {
+        return static_cast<T *>(ptr);
+    }
+
+    // globally qualified, because MSVC somehow prefers the name from the dependent base
+    // of this class over the `reference` template that's visible in the global namespace of this file...
+    __host__ __device__
+    typename ::reference<T>::type dereference() const
+    {
+        return *get();
+    }
+
+    __host__ __device__
+    void increment()
+    {
+        advance(1);
+    }
+
+    __host__ __device__
+    void decrement()
+    {
+        advance(-1);
+    }
+
+    __host__ __device__
+    void advance(std::ptrdiff_t diff)
+    {
+        ptr = get() + diff;
+        offset += diff * sizeof(T);
+    }
+
+    __host__ __device__
+    bool equal(const tracked_pointer & other) const
+    {
+        return id == other.id && size == other.size && alignment == other.alignment && offset == other.offset && ptr == other.ptr;
+    }
+};
+
+class tracked_resource final : public thrust::mr::memory_resource<tracked_pointer<void> >
+{
+public:
+    tracked_resource() : id_to_allocate(0), id_to_deallocate(0)
+    {
+    }
+
+    ~tracked_resource()
+    {
+        ASSERT_EQUAL(id_to_allocate, 0u);
+        ASSERT_EQUAL(id_to_deallocate, 0u);
+    }
+
+    virtual tracked_pointer<void> do_allocate(std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        ASSERT_EQUAL(static_cast<bool>(id_to_allocate), true);
+
+        void * raw = upstream.do_allocate(n, alignment);
+        tracked_pointer<void> ret(raw);
+        ret.id = id_to_allocate;
+        ret.size = n;
+        ret.alignment = alignment;
+
+        id_to_allocate = 0;
+
+        return ret;
+    }
+
+    virtual void do_deallocate(tracked_pointer<void> p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        ASSERT_EQUAL(p.size, n);
+        ASSERT_EQUAL(p.alignment, alignment);
+
+        if (id_to_deallocate != 0)
+        {
+            ASSERT_EQUAL(p.id, id_to_deallocate);
+            id_to_deallocate = 0;
+        }
+
+        upstream.do_deallocate(p.ptr, n, alignment);
+    }
+
+    std::size_t id_to_allocate;
+    std::size_t id_to_deallocate;
+
+private:
+    thrust::mr::new_delete_resource upstream;
+};
+
+template<template<typename> class PoolTemplate>
+void TestPool()
+{
+    tracked_resource upstream;
+
+    upstream.id_to_allocate = -1u;
+
+    typedef PoolTemplate<
+        tracked_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = false;
+
+    // avoid having the destructor run when an assertion failure is raised
+    // (the destructor will try to release, which in turn calls do_deallocate,
+    // which may fail with an assertion failure exception...)
+    Pool * pool = new Pool(&upstream, opts);
+
+    upstream.id_to_allocate = 1;
+
+    // first allocation
+    tracked_pointer<void> a1 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    // due to chunking, the above allocation should be enough for the next one too
+    tracked_pointer<void> a2 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a2.id, 1u);
+
+    // deallocating and allocating back should give the same resource back
+    pool->do_deallocate(a1, 12, THRUST_MR_DEFAULT_ALIGNMENT);
+    tracked_pointer<void> a3 = pool->do_allocate(12, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(a1.id, a3.id);
+    ASSERT_EQUAL(a1.size, a3.size);
+    ASSERT_EQUAL(a1.alignment, a3.alignment);
+    ASSERT_EQUAL(a1.offset, a3.offset);
+
+    // allocating over-aligned memory should give non-cached results
+    // unlike with the disjoint version, nothing sensible can be said about the chunk size
+    upstream.id_to_allocate = 2;
+    tracked_pointer<void> a4 = pool->do_allocate(32, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(a4.id, 2u);
+    ASSERT_EQUAL(a4.alignment, (std::size_t)THRUST_MR_DEFAULT_ALIGNMENT * 2);
+
+    // and deallocating it should return it back to upstream
+    upstream.id_to_deallocate = 2;
+    pool->do_deallocate(a4, 32u, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // release actually returns properly sized memory to upstream
+    upstream.id_to_deallocate = 1;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and does the same for oversized/overaligned memory
+    upstream.id_to_allocate = 3;
+    tracked_pointer<void> a5 = pool->do_allocate(1024, THRUST_MR_DEFAULT_ALIGNMENT * 2);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    upstream.id_to_deallocate = 3;
+    pool->release();
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+
+    // and after that, the formerly cached memory isn't used anymore,
+    // so new memory from upstream is returned back
+    upstream.id_to_allocate = 4;
+    tracked_pointer<void> a6 = pool->do_allocate(16, THRUST_MR_DEFAULT_ALIGNMENT);
+    ASSERT_EQUAL(upstream.id_to_allocate, 0u);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    // destruction also returns memory
+    upstream.id_to_deallocate = 4;
+
+    // actually destroy the pool; reasons why RAII is not used outlined at the beginning
+    // of this function
+    delete pool;
+    ASSERT_EQUAL(upstream.id_to_deallocate, 0u);
+}
+
+void TestUnsynchronizedPool()
+{
+    TestPool<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedPool()
+{
+    TestPool<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedPool);
+#endif
+
+template<template<typename> class PoolTemplate>
+void TestPoolCachingOversized()
+{
+    tracked_resource upstream;
+
+    upstream.id_to_allocate = -1u;
+
+    typedef PoolTemplate<
+        tracked_resource
+    > Pool;
+
+    thrust::mr::pool_options opts = Pool::get_default_options();
+    opts.cache_oversized = true;
+    opts.largest_block_size = 1024;
+
+    Pool pool(&upstream, opts);
+
+    upstream.id_to_allocate = 1;
+    tracked_pointer<void> a1 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a1.id, 1u);
+
+    upstream.id_to_allocate = 2;
+    tracked_pointer<void> a2 = pool.do_allocate(64, 32);
+    ASSERT_EQUAL(a2.id, 2u);
+
+    pool.do_deallocate(a2, 64, 32);
+    pool.do_deallocate(a1, 2048, 32);
+
+    // make sure a good fit is used from the cache
+    tracked_pointer<void> a3 = pool.do_allocate(32, 32);
+    ASSERT_EQUAL(a3.id, 2u);
+
+    tracked_pointer<void> a4 = pool.do_allocate(1024, 32);
+    ASSERT_EQUAL(a4.id, 1u);
+
+    pool.do_deallocate(a4, 1024, 32);
+
+    // make sure that a new block is allocated when there's nothing cached with
+    // the required alignment
+    upstream.id_to_allocate = 3;
+    tracked_pointer<void> a5 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a5.id, 3u);
+
+    pool.release();
+
+    // make sure that release actually clears caches
+    upstream.id_to_allocate = 4;
+    tracked_pointer<void> a6 = pool.do_allocate(32, 64);
+    ASSERT_EQUAL(a6.id, 4u);
+
+    upstream.id_to_allocate = 5;
+    tracked_pointer<void> a7 = pool.do_allocate(2048, 1024);
+    ASSERT_EQUAL(a7.id, 5u);
+
+    pool.do_deallocate(a7, 2048, 1024);
+
+    // make sure that the 'ridiculousness' factor for size (options.cached_size_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 6;
+    tracked_pointer<void> a8 = pool.do_allocate(24, 1024);
+    ASSERT_EQUAL(a8.id, 6u);
+
+    // make sure that the 'ridiculousness' factor for alignment (options.cached_alignment_cutoff_factor)
+    // is respected
+    upstream.id_to_allocate = 7;
+    tracked_pointer<void> a9 = pool.do_allocate(2048, 32);
+    ASSERT_EQUAL(a9.id, 7u);
+}
+
+void TestUnsynchronizedPoolCachingOversized()
+{
+    TestPoolCachingOversized<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedPoolCachingOversized);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedPoolCachingOversized()
+{
+    TestPoolCachingOversized<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedPoolCachingOversized);
+#endif
+
+template<template<typename> class PoolTemplate>
+void TestGlobalPool()
+{
+    typedef PoolTemplate<
+        thrust::mr::new_delete_resource
+    > Pool;
+
+    ASSERT_EQUAL(thrust::mr::get_global_resource<Pool>() != NULL, true);
+}
+
+void TestUnsynchronizedGlobalPool()
+{
+    TestGlobalPool<thrust::mr::unsynchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestUnsynchronizedGlobalPool);
+
+#if THRUST_CPP_DIALECT >= 2011
+void TestSynchronizedGlobalPool()
+{
+    TestGlobalPool<thrust::mr::synchronized_pool_resource>();
+}
+DECLARE_UNITTEST(TestSynchronizedGlobalPool);
+#endif
+
diff --git a/testing/mr_pool_options.cu b/testing/mr_pool_options.cu
new file mode 100644
index 000000000..b53e336df
--- /dev/null
+++ b/testing/mr_pool_options.cu
@@ -0,0 +1,63 @@
+#include <unittest/unittest.h>
+#include <thrust/mr/pool_options.h>
+
+void TestPoolOptionsBasicValidity()
+{
+    thrust::mr::pool_options options = thrust::mr::pool_options();
+    ASSERT_EQUAL(options.validate(), false);
+
+    options.max_blocks_per_chunk = 1024;
+    options.max_bytes_per_chunk = 1024 * 1024;
+    options.smallest_block_size = 8;
+    options.largest_block_size = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the minimum number of blocks per chunk is bigger than the max
+    options.min_blocks_per_chunk = 1025;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_blocks_per_chunk = 128;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the minimum number of bytes per chunk is bigger than the max
+    options.min_bytes_per_chunk = 1025 * 1024;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_bytes_per_chunk = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // smallest block size is bigger than the largest block size
+    options.smallest_block_size = 2048;
+    ASSERT_EQUAL(options.validate(), false);
+    options.smallest_block_size = 8;
+    ASSERT_EQUAL(options.validate(), true);
+}
+DECLARE_UNITTEST(TestPoolOptionsBasicValidity);
+
+void TestPoolOptionsComplexValidity()
+{
+    thrust::mr::pool_options options = thrust::mr::pool_options();
+    ASSERT_EQUAL(options.validate(), false);
+
+    options.max_blocks_per_chunk = 1024;
+    options.max_bytes_per_chunk = 1024 * 1024;
+    options.smallest_block_size = 8;
+    options.largest_block_size = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    options.min_bytes_per_chunk = 2 * 1024;
+    options.max_bytes_per_chunk = 256 * 1024;
+
+    // the biggest allowed allocation (deduced from blocks in chunks)
+    // is smaller than the minimal allowed one (defined in bytes)
+    options.max_blocks_per_chunk = 1;
+    ASSERT_EQUAL(options.validate(), false);
+    options.max_blocks_per_chunk = 1024;
+    ASSERT_EQUAL(options.validate(), true);
+
+    // the smallest allowed allocation (deduced from blocks in chunks)
+    // is bigger than the maximum allowed one (defined in bytes)
+    options.min_blocks_per_chunk = 1024 * 1024;
+    ASSERT_EQUAL(options.validate(), false);
+    options.min_blocks_per_chunk = 128;
+    ASSERT_EQUAL(options.validate(), true);
+}
+DECLARE_UNITTEST(TestPoolOptionsComplexValidity);
diff --git a/testing/namespace_wrapped.cu b/testing/namespace_wrapped.cu
new file mode 100644
index 000000000..b6bcb3dbb
--- /dev/null
+++ b/testing/namespace_wrapped.cu
@@ -0,0 +1,43 @@
+// Wrap thrust and cub in different enclosing namespaces
+// (In practice, you probably want these to be the same, in which case just
+// set THRUST_CUB_WRAPPED_NAMESPACE to set both).
+#define THRUST_WRAPPED_NAMESPACE wrap_thrust
+#define CUB_WRAPPED_NAMESPACE    wrap_cub
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+#include <unittest/unittest.h>
+
+// Test that we can use a few common utilities and algorithms from a wrapped
+// namespace at runtime. More extensive testing is performed by the header
+// tests and the check_namespace.cmake test.
+void TestWrappedNamespace()
+{
+  const std::size_t n = 2048;
+
+  const auto in_1_begin =
+    ::wrap_thrust::thrust::make_constant_iterator<int>(12);
+  const auto in_2_begin =
+    ::wrap_thrust::thrust::make_counting_iterator<int>(1024);
+
+  // Check that the qualifier resolves properly:
+  THRUST_NS_QUALIFIER::device_vector<int> d_out(n);
+
+  ::wrap_thrust::thrust::transform(in_1_begin,
+                                   in_1_begin + n,
+                                   in_2_begin,
+                                   d_out.begin(),
+                                   ::wrap_thrust::thrust::plus<>{});
+
+  ::wrap_thrust::thrust::host_vector<int> h_out(d_out);
+
+  for (std::size_t i = 0; i < n; ++i)
+  {
+    ASSERT_EQUAL(h_out[i], static_cast<int>(i) + 1024 + 12);
+  }
+}
+DECLARE_UNITTEST(TestWrappedNamespace);
diff --git a/testing/omp/CMakeLists.txt b/testing/omp/CMakeLists.txt
new file mode 100644
index 000000000..89ea9bb0c
--- /dev/null
+++ b/testing/omp/CMakeLists.txt
@@ -0,0 +1,18 @@
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  if (NOT config_device STREQUAL "OMP")
+    continue()
+  endif()
+
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "omp.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/backend/omp/nvcc_independence.cpp b/testing/omp/nvcc_independence.cpp
similarity index 100%
rename from testing/backend/omp/nvcc_independence.cpp
rename to testing/omp/nvcc_independence.cpp
diff --git a/testing/backend/omp/reduce_intervals.cu b/testing/omp/reduce_intervals.cu
similarity index 100%
rename from testing/backend/omp/reduce_intervals.cu
rename to testing/omp/reduce_intervals.cu
diff --git a/testing/out_of_memory_recovery.cu b/testing/out_of_memory_recovery.cu
new file mode 100644
index 000000000..5e4f0c327
--- /dev/null
+++ b/testing/out_of_memory_recovery.cu
@@ -0,0 +1,33 @@
+// Regression test for NVBug 2720132.
+//
+// Summary of 2720132:
+//
+// 1. The large allocation fails due to running out of memory.
+// 2. A `thrust::system::system_error` exception is thrown.
+// 3. Local objects are destroyed as the stack is unwound, leading to the destruction of `x`.
+// 4. `x` runs a parallel algorithm in its destructor to call the destructors of all of its elements.
+// 5. Launching that parallel algorithm fails because of the prior CUDA out of memory error.
+// 6. A `thrust::system::system_error` exception is thrown.
+// 7. Because we've already got an active exception, `terminate` is called.
+
+#include <unittest/unittest.h>
+#include <thrust/device_vector.h>
+#include <thrust/detail/cstdint.h>
+
+struct non_trivial
+{
+  __host__ __device__ non_trivial() {}
+  __host__ __device__ ~non_trivial() {}
+};
+
+void test_out_of_memory_recovery()
+{
+  try
+  {
+    thrust::device_vector<non_trivial> x(1);
+
+    thrust::device_vector<thrust::detail::uint32_t> y(0x00ffffffffffffff);
+  }
+  catch (...) { }
+}
+DECLARE_UNITTEST(test_out_of_memory_recovery);
diff --git a/testing/pair.cu b/testing/pair.cu
index d3a4efe93..f5f6e92b5 100644
--- a/testing/pair.cu
+++ b/testing/pair.cu
@@ -211,24 +211,44 @@ struct TestPairGet
     ASSERT_EQUAL(data[1], thrust::get<1>(p));
   }
 };
-SimpleUnitTest<TestPairGet, NumericTypes> TestPairGetInstance;
+SimpleUnitTest<TestPairGet, BuiltinNumericTypes> TestPairGetInstance;
 
+using PairConstVolatileTypes =
+    unittest::type_list<thrust::pair<int, float>, thrust::pair<int, float> const,
+                        thrust::pair<int, float> const volatile>;
 
-void TestPairTupleSize(void)
+template <typename Pair> 
+struct TestPairTupleSize
 {
-  int result = thrust::tuple_size< thrust::pair<int,int> >::value;
-  ASSERT_EQUAL(2, result);
+  void operator()()
+  {
+    ASSERT_EQUAL(2, static_cast<int>(thrust::tuple_size<Pair>::value));
+  }
 };
-DECLARE_UNITTEST(TestPairTupleSize);
+SimpleUnitTest<TestPairTupleSize, PairConstVolatileTypes> TestPairTupleSizeInstance;
 
 
 void TestPairTupleElement(void)
 {
-  typedef thrust::tuple_element<0, thrust::pair<int, float> >::type type0;
-  typedef thrust::tuple_element<1, thrust::pair<int, float> >::type type1;
-
-  ASSERT_EQUAL_QUIET(typeid(int),   typeid(type0));
-  ASSERT_EQUAL_QUIET(typeid(float), typeid(type1));
+  using type0 = thrust::tuple_element<0, thrust::pair<int, float> >::type;
+  using type1 = thrust::tuple_element<1, thrust::pair<int, float> >::type;
+  static_assert(std::is_same<int, type0>::value,"");
+  static_assert(std::is_same<float, type1>::value,"");
+
+  using c_type0 = thrust::tuple_element<0, thrust::pair<int, float> const>::type;
+  using c_type1 = thrust::tuple_element<1, thrust::pair<int, float> const>::type;
+  static_assert(std::is_same<int const, c_type0>::value,"");
+  static_assert(std::is_same<float const, c_type1>::value,"");
+
+  using v_type0 = thrust::tuple_element<0, thrust::pair<int, float> volatile>::type;
+  using v_type1 = thrust::tuple_element<1, thrust::pair<int, float> volatile>::type;
+  static_assert(std::is_same<int volatile, v_type0>::value,"");
+  static_assert(std::is_same<float volatile, v_type1>::value,"");
+
+  using cv_type0 = thrust::tuple_element<0, thrust::pair<int, float> const volatile>::type;
+  using cv_type1 = thrust::tuple_element<1, thrust::pair<int, float> const volatile>::type;
+  static_assert(std::is_same<int const volatile, cv_type0>::value,"");
+  static_assert(std::is_same<float const volatile, cv_type1>::value,"");
 };
 DECLARE_UNITTEST(TestPairTupleElement);
 
diff --git a/testing/pair_reduce.cu b/testing/pair_reduce.cu
index ebdab6597..6682fb3cc 100644
--- a/testing/pair_reduce.cu
+++ b/testing/pair_reduce.cu
@@ -20,7 +20,11 @@ struct add_pairs
   __host__ __device__
     Pair1 operator()(const Pair1 &x, const Pair2 &y)
   {
-    return thrust::make_pair(x.first + y.first, x.second + y.second);
+    // Need cast to undo integer promotion, decltype(char{} + char{}) == int
+    using P1T1 = typename Pair1::first_type;
+    using P1T2 = typename Pair1::second_type;
+    return thrust::make_pair(static_cast<P1T1>(x.first + y.first),
+                             static_cast<P1T2>(x.second + y.second));
   } // end operator()
 }; // end add_pairs
 
@@ -43,7 +47,7 @@ template <typename T>
     thrust::device_vector<T> d_p2 = h_p2;
     thrust::device_vector<P> d_pairs = h_pairs;
 
-    P init = thrust::make_pair(13,13);
+    P init = thrust::make_pair(T{13}, T{13});
 
     // reduce on the host
     P h_result = thrust::reduce(h_pairs.begin(), h_pairs.end(), init, add_pairs());
diff --git a/testing/pair_scan.cu b/testing/pair_scan.cu
index 2bebebed6..5554c6dc4 100644
--- a/testing/pair_scan.cu
+++ b/testing/pair_scan.cu
@@ -4,7 +4,7 @@
 #include <thrust/scan.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include <backend/cuda/testframework.h>
+#include <unittest/cuda/testframework.h>
 #endif
 
 struct make_pair_functor
@@ -61,19 +61,6 @@ template <typename T>
     thrust::inclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), thrust::maximum<P>());
     ASSERT_EQUAL_QUIET(h_output, d_output);
 
-
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     // scan with plus
     thrust::exclusive_scan(h_pairs.begin(), h_pairs.end(), h_output.begin(), init, add_pairs());
     thrust::exclusive_scan(d_pairs.begin(), d_pairs.end(), d_output.begin(), init, add_pairs());
diff --git a/testing/pair_scan_by_key.cu b/testing/pair_scan_by_key.cu
index 6e63bc806..21b53bcbe 100644
--- a/testing/pair_scan_by_key.cu
+++ b/testing/pair_scan_by_key.cu
@@ -20,7 +20,11 @@ struct add_pairs
   __host__ __device__
     Pair1 operator()(const Pair1 &x, const Pair2 &y)
   {
-    return thrust::make_pair(x.first + y.first, x.second + y.second);
+    // Need cast to undo integer promotion, decltype(char{} + char{}) == int
+    using P1T1 = typename Pair1::first_type;
+    using P1T2 = typename Pair1::second_type;
+    return thrust::make_pair(static_cast<P1T1>(x.first + y.first),
+                             static_cast<P1T2>(x.second + y.second));
   } // end operator()
 }; // end add_pairs
 
@@ -46,7 +50,7 @@ template <typename T>
     thrust::host_vector<T>   h_keys = unittest::random_integers<bool>(n);
     thrust::device_vector<T> d_keys = h_keys;
 
-    P init = thrust::make_pair(13,13);
+    P init = thrust::make_pair(T{13}, T{13});
 
     // scan on the host
     thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_pairs.begin(), h_pairs.begin(), init, thrust::equal_to<T>(), add_pairs());
diff --git a/testing/partition.cu b/testing/partition.cu
index 5ebb804e9..31aaa9fdd 100644
--- a/testing/partition.cu
+++ b/testing/partition.cu
@@ -6,6 +6,12 @@
 #include <thrust/iterator/retag.h>
 #include <thrust/sort.h>
 
+#if defined(THRUST_GCC_VERSION) && \
+  THRUST_GCC_VERSION >= 110000 && \
+  THRUST_GCC_VERSION < 120000
+#define WAIVE_GCC11_FAILURES
+#endif
+
 template<typename T>
 struct is_even
 {
@@ -21,6 +27,17 @@ void TestPartitionSimple(void)
     typedef typename Vector::value_type T;
     typedef typename Vector::iterator   Iterator;
 
+    // GCC 11 miscompiles and segfaults for certain versions of this test.
+    // It's not reproducible on other compilers, and the test passes when
+    // optimizations are disabled. It only affects 32-bit value types, and
+    // impacts all CPU host/device combinations tested.
+#ifdef WAIVE_GCC11_FAILURES
+    if (sizeof(T) == 4)
+    {
+      return;
+    }
+#endif
+
     Vector data(5);
     data[0] = 1; 
     data[1] = 2; 
@@ -40,7 +57,7 @@ void TestPartitionSimple(void)
     ASSERT_EQUAL(iter - data.begin(), 2);
     ASSERT_EQUAL(data, ref);
 }
-DECLARE_VECTOR_UNITTEST(TestPartitionSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionSimple);
 
 template<typename Vector>
 void TestPartitionStencilSimple(void)
@@ -74,7 +91,7 @@ void TestPartitionStencilSimple(void)
     ASSERT_EQUAL(iter - data.begin(), 2);
     ASSERT_EQUAL(data, ref);
 }
-DECLARE_VECTOR_UNITTEST(TestPartitionStencilSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionStencilSimple);
 
 
 template<typename Vector>
@@ -109,7 +126,7 @@ void TestPartitionCopySimple(void)
     ASSERT_EQUAL(true_ref, true_results);
     ASSERT_EQUAL(false_ref, false_results);
 }
-DECLARE_VECTOR_UNITTEST(TestPartitionCopySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionCopySimple);
 
 
 template<typename Vector>
@@ -151,7 +168,7 @@ void TestPartitionCopyStencilSimple(void)
     ASSERT_EQUAL(true_ref, true_results);
     ASSERT_EQUAL(false_ref, false_results);
 }
-DECLARE_VECTOR_UNITTEST(TestPartitionCopyStencilSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionCopyStencilSimple);
 
 
 template<typename Vector>
@@ -179,7 +196,7 @@ void TestStablePartitionSimple(void)
     ASSERT_EQUAL(iter - data.begin(), 2);
     ASSERT_EQUAL(data, ref);
 }
-DECLARE_VECTOR_UNITTEST(TestStablePartitionSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStablePartitionSimple);
 
 
 template<typename Vector>
@@ -249,7 +266,7 @@ void TestStablePartitionCopySimple(void)
     ASSERT_EQUAL(true_ref, true_results);
     ASSERT_EQUAL(false_ref, false_results);
 }
-DECLARE_VECTOR_UNITTEST(TestStablePartitionCopySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStablePartitionCopySimple);
 
 
 template<typename Vector>
@@ -321,6 +338,17 @@ struct TestPartitionStencil
 {
     void operator()(const size_t n)
     {
+        // GCC 11 miscompiles and segfaults for certain versions of this test.
+        // It's not reproducible on other compilers, and the test passes when
+        // optimizations are disabled. It only affects 32-bit value types, and
+        // impacts all CPU host/device combinations tested.
+#ifdef WAIVE_GCC11_FAILURES
+        if (n == 0 && sizeof(T) == 4)
+        {
+          return;
+        }
+#endif
+
         // setup ranges
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::host_vector<T>   h_stencil = unittest::random_integers<T>(n);
@@ -349,8 +377,8 @@ struct TestPartitionCopy
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::device_vector<T> d_data = h_data;
         
-        size_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // setup output ranges
         thrust::host_vector<T>   h_true_results (n_true,  0);
@@ -393,8 +421,8 @@ struct TestPartitionCopyStencil
         thrust::device_vector<T> d_data = h_data;
         thrust::device_vector<T> d_stencil = h_stencil;
         
-        size_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // setup output ranges
         thrust::host_vector<T>   h_true_results (n_true,  0);
@@ -437,8 +465,8 @@ struct TestStablePartitionCopyStencil
         thrust::device_vector<T> d_data = h_data;
         thrust::device_vector<T> d_stencil = h_stencil;
         
-        size_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // setup output ranges
         thrust::host_vector<T>   h_true_results (n_true,  0);
@@ -479,8 +507,8 @@ struct TestPartitionCopyToDiscardIterator
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::device_vector<T> d_data = h_data;
         
-        size_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // mask both ranges
         thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
@@ -584,8 +612,8 @@ struct TestPartitionCopyStencilToDiscardIterator
         thrust::device_vector<T> d_data = h_data;
         thrust::device_vector<T> d_stencil = h_stencil;
         
-        size_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // mask both ranges
         thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
@@ -684,6 +712,9 @@ struct TestPartitionCopyStencilToDiscardIterator
 VariableUnitTest<TestPartitionCopyStencilToDiscardIterator, PartitionTypes> TestPartitionCopyStencilToDiscardIteratorInstance;
 
 
+// GCC 11 miscompiles and segfaults in this tests.
+#ifndef WAIVE_GCC11_FAILURES
+
 template <typename T>
 struct TestStablePartition
 {
@@ -702,6 +733,11 @@ struct TestStablePartition
 };
 VariableUnitTest<TestStablePartition, PartitionTypes> TestStablePartitionInstance;
 
+#endif // WAIVE_GCC11_FAILURES
+
+
+// GCC 11 miscompiles and segfaults in this tests.
+#ifndef WAIVE_GCC11_FAILURES
 
 template <typename T>
 struct TestStablePartitionStencil
@@ -723,6 +759,8 @@ struct TestStablePartitionStencil
 };
 VariableUnitTest<TestStablePartitionStencil, PartitionTypes> TestStablePartitionStencilInstance;
 
+#endif // WAIVE_GCC11_FAILURES
+
 
 template <typename T>
 struct TestStablePartitionCopy
@@ -733,8 +771,8 @@ struct TestStablePartitionCopy
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::device_vector<T> d_data = h_data;
         
-        size_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // setup output ranges
         thrust::host_vector<T>   h_true_results (n_true,  0);
@@ -771,8 +809,8 @@ struct TestStablePartitionCopyToDiscardIterator
         thrust::host_vector<T>   h_data = unittest::random_integers<T>(n);
         thrust::device_vector<T> d_data = h_data;
         
-        size_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_data.begin(), h_data.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // mask both ranges
         thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
@@ -876,8 +914,8 @@ struct TestStablePartitionCopyStencilToDiscardIterator
         thrust::device_vector<T> d_data = h_data;
         thrust::device_vector<T> d_stencil = h_stencil;
         
-        size_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
-        size_t n_false = n - n_true;
+        std::ptrdiff_t n_true  = thrust::count_if(h_stencil.begin(), h_stencil.end(), is_even<T>());
+        std::ptrdiff_t n_false = n - n_true;
 
         // mask both ranges
         thrust::pair<thrust::discard_iterator<>, thrust::discard_iterator<> > h_result1 =
@@ -990,8 +1028,6 @@ struct is_ordered
 template<typename Vector>
 void TestPartitionZipIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data1(5);
     Vector data2(5);
 
@@ -1029,8 +1065,6 @@ DECLARE_VECTOR_UNITTEST(TestPartitionZipIterator);
 template<typename Vector>
 void TestPartitionStencilZipIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data(5);
     data[0] = 1;
     data[1] = 0;
@@ -1072,8 +1106,6 @@ DECLARE_VECTOR_UNITTEST(TestPartitionStencilZipIterator);
 template<typename Vector>
 void TestStablePartitionZipIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data1(5);
     Vector data2(5);
 
@@ -1111,8 +1143,6 @@ DECLARE_VECTOR_UNITTEST(TestStablePartitionZipIterator);
 template<typename Vector>
 void TestStablePartitionStencilZipIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector data(5);
     data[0] = 1;
     data[1] = 0;
@@ -1155,8 +1185,8 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator partition(my_system &system,
                           ForwardIterator first,
-                          ForwardIterator last,
-                          Predicate pred)
+                          ForwardIterator,
+                          Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -1182,9 +1212,9 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator partition(my_system &system,
                           ForwardIterator first,
-                          ForwardIterator last,
-                          InputIterator stencil,
-                          Predicate pred)
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -1210,8 +1240,8 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator partition(my_tag,
                           ForwardIterator first,
-                          ForwardIterator last,
-                          Predicate pred)
+                          ForwardIterator,
+                          Predicate)
 {
     *first = 13;
     return first;
@@ -1235,9 +1265,9 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator partition(my_tag,
                           ForwardIterator first,
-                          ForwardIterator last,
-                          InputIterator stencil,
-                          Predicate pred)
+                          ForwardIterator,
+                          InputIterator,
+                          Predicate)
 {
     *first = 13;
     return first;
@@ -1262,11 +1292,11 @@ template<typename InputIterator,
          typename Predicate>
   thrust::pair<OutputIterator1,OutputIterator2>
     partition_copy(my_system &system,
-                   InputIterator first,
-                   InputIterator last,
+                   InputIterator,
+                   InputIterator,
                    OutputIterator1 out_true,
                    OutputIterator2 out_false,
-                   Predicate pred)
+                   Predicate)
 {
   system.validate_dispatch();
   return thrust::make_pair(out_true,out_false);
@@ -1296,12 +1326,12 @@ template<typename InputIterator1,
          typename Predicate>
   thrust::pair<OutputIterator1,OutputIterator2>
     partition_copy(my_system &system,
-                   InputIterator1 first,
-                   InputIterator1 last,
-                   InputIterator2 stencil,
+                   InputIterator1,
+                   InputIterator1,
+                   InputIterator2,
                    OutputIterator1 out_true,
                    OutputIterator2 out_false,
-                   Predicate pred)
+                   Predicate)
 {
   system.validate_dispatch();
   return thrust::make_pair(out_true,out_false);
@@ -1332,10 +1362,10 @@ template<typename InputIterator,
   thrust::pair<OutputIterator1,OutputIterator2>
     partition_copy(my_tag,
                    InputIterator first,
-                   InputIterator last,
+                   InputIterator,
                    OutputIterator1 out_true,
                    OutputIterator2 out_false,
-                   Predicate pred)
+                   Predicate)
 {
   *first = 13;
   return thrust::make_pair(out_true,out_false);
@@ -1364,11 +1394,11 @@ template<typename InputIterator1,
   thrust::pair<OutputIterator1,OutputIterator2>
     partition_copy(my_tag,
                    InputIterator1 first,
-                   InputIterator1 last,
-                   InputIterator2 stencil,
+                   InputIterator1,
+                   InputIterator2,
                    OutputIterator1 out_true,
                    OutputIterator2 out_false,
-                   Predicate pred)
+                   Predicate)
 {
   *first = 13;
   return thrust::make_pair(out_true,out_false);
@@ -1394,8 +1424,8 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator stable_partition(my_system &system,
                                  ForwardIterator first,
-                                 ForwardIterator last,
-                                 Predicate pred)
+                                 ForwardIterator,
+                                 Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -1421,9 +1451,9 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator stable_partition(my_system &system,
                                  ForwardIterator first,
-                                 ForwardIterator last,
-                                 InputIterator stencil,
-                                 Predicate pred)
+                                 ForwardIterator,
+                                 InputIterator,
+                                 Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -1449,8 +1479,8 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator stable_partition(my_tag,
                                  ForwardIterator first,
-                                 ForwardIterator last,
-                                 Predicate pred)
+                                 ForwardIterator,
+                                 Predicate)
 {
     *first = 13;
     return first;
@@ -1474,9 +1504,9 @@ template<typename ForwardIterator,
          typename Predicate>
 ForwardIterator stable_partition(my_tag,
                                  ForwardIterator first,
-                                 ForwardIterator last,
-                                 InputIterator stencil,
-                                 Predicate pred)
+                                 ForwardIterator,
+                                 InputIterator,
+                                 Predicate)
 {
     *first = 13;
     return first;
@@ -1502,11 +1532,11 @@ template<typename InputIterator,
          typename Predicate>
   thrust::pair<OutputIterator1,OutputIterator2>
     stable_partition_copy(my_system &system,
-                          InputIterator first,
-                          InputIterator last,
+                          InputIterator,
+                          InputIterator,
                           OutputIterator1 out_true,
                           OutputIterator2 out_false,
-                          Predicate pred)
+                          Predicate)
 {
   system.validate_dispatch();
   return thrust::make_pair(out_true,out_false);
@@ -1536,12 +1566,12 @@ template<typename InputIterator1,
          typename Predicate>
   thrust::pair<OutputIterator1,OutputIterator2>
     stable_partition_copy(my_system &system,
-                          InputIterator1 first,
-                          InputIterator1 last,
-                          InputIterator2 stencil,
+                          InputIterator1,
+                          InputIterator1,
+                          InputIterator2,
                           OutputIterator1 out_true,
                           OutputIterator2 out_false,
-                          Predicate pred)
+                          Predicate)
 {
   system.validate_dispatch();
   return thrust::make_pair(out_true,out_false);
@@ -1572,10 +1602,10 @@ template<typename InputIterator,
   thrust::pair<OutputIterator1,OutputIterator2>
     stable_partition_copy(my_tag,
                           InputIterator first,
-                          InputIterator last,
+                          InputIterator,
                           OutputIterator1 out_true,
                           OutputIterator2 out_false,
-                          Predicate pred)
+                          Predicate)
 {
   *first = 13;
   return thrust::make_pair(out_true,out_false);
@@ -1604,11 +1634,11 @@ template<typename InputIterator1,
   thrust::pair<OutputIterator1,OutputIterator2>
     stable_partition_copy(my_tag,
                           InputIterator1 first,
-                          InputIterator1 last,
-                          InputIterator2 stencil,
+                          InputIterator1,
+                          InputIterator2,
                           OutputIterator1 out_true,
                           OutputIterator2 out_false,
-                          Predicate pred)
+                          Predicate)
 {
   *first = 13;
   return thrust::make_pair(out_true,out_false);
diff --git a/testing/partition_point.cu b/testing/partition_point.cu
index e9fb72ddf..bd5a6a8c8 100644
--- a/testing/partition_point.cu
+++ b/testing/partition_point.cu
@@ -45,14 +45,14 @@ void TestPartitionPoint(void)
 
   ASSERT_EQUAL(ref - v.begin(), thrust::partition_point(v.begin(), v.end(), is_even<T>()) - v.begin());
 }
-DECLARE_VECTOR_UNITTEST(TestPartitionPoint);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPartitionPoint);
 
 
 template<typename ForwardIterator, typename Predicate>
 ForwardIterator partition_point(my_system &system, 
                                 ForwardIterator first,
-                                ForwardIterator last,
-                                Predicate pred)
+                                ForwardIterator,
+                                Predicate)
 {
   system.validate_dispatch();
   return first;
@@ -76,8 +76,8 @@ DECLARE_UNITTEST(TestPartitionPointDispatchExplicit);
 template<typename ForwardIterator, typename Predicate>
 ForwardIterator partition_point(my_tag,
                                 ForwardIterator first,
-                                ForwardIterator last,
-                                Predicate pred)
+                                ForwardIterator,
+                                Predicate)
 {
   *first = 13;
   return first;
@@ -95,3 +95,39 @@ void TestPartitionPointDispatchImplicit()
 }
 DECLARE_UNITTEST(TestPartitionPointDispatchImplicit);
 
+struct test_less_than
+{
+    long long expected;
+
+    __device__
+    bool operator()(long long y)
+    {
+        return y < expected;
+    }
+};
+
+void TestPartitionPointWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    test_less_than fn = { (1ll << magnitude) - 17 };
+
+    ASSERT_EQUAL(thrust::distance(
+        begin,
+        thrust::partition_point(
+            thrust::device,
+            begin, end,
+            fn)),
+        (1ll << magnitude) - 17);
+}
+
+void TestPartitionPointWithBigIndexes()
+{
+    TestPartitionPointWithBigIndexesHelper(30);
+    TestPartitionPointWithBigIndexesHelper(31);
+    TestPartitionPointWithBigIndexesHelper(32);
+    TestPartitionPointWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestPartitionPointWithBigIndexes);
diff --git a/testing/permutation_iterator.cu b/testing/permutation_iterator.cu
index 4fa32fd38..22fef650c 100644
--- a/testing/permutation_iterator.cu
+++ b/testing/permutation_iterator.cu
@@ -52,7 +52,7 @@ void TestPermutationIteratorSimple(void)
     ASSERT_EQUAL(source[6],  7);
     ASSERT_EQUAL(source[7],  8);
 }
-DECLARE_VECTOR_UNITTEST(TestPermutationIteratorSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorSimple);
 
 template <class Vector>
 void TestPermutationIteratorGather(void)
@@ -80,7 +80,7 @@ void TestPermutationIteratorGather(void)
     ASSERT_EQUAL(output[2], 6);
     ASSERT_EQUAL(output[3], 8);
 }
-DECLARE_VECTOR_UNITTEST(TestPermutationIteratorGather);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorGather);
 
 template <class Vector>
 void TestPermutationIteratorScatter(void)
@@ -113,13 +113,11 @@ void TestPermutationIteratorScatter(void)
     ASSERT_EQUAL(output[6],  7);
     ASSERT_EQUAL(output[7], 10);
 }
-DECLARE_VECTOR_UNITTEST(TestPermutationIteratorScatter);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorScatter);
 
 template <class Vector>
 void TestMakePermutationIterator(void)
 {
-    typedef typename Vector::iterator Iterator;
-
     Vector source(8);
     Vector indices(4);
     Vector output(4, 10);
@@ -141,7 +139,7 @@ void TestMakePermutationIterator(void)
     ASSERT_EQUAL(output[2], 6);
     ASSERT_EQUAL(output[3], 8);
 }
-DECLARE_VECTOR_UNITTEST(TestMakePermutationIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestMakePermutationIterator);
 
 template <typename Vector>
 void TestPermutationIteratorReduce(void)
@@ -176,7 +174,7 @@ void TestPermutationIteratorReduce(void)
                                          thrust::plus<T>());
     ASSERT_EQUAL(result2, -19);
 };
-DECLARE_VECTOR_UNITTEST(TestPermutationIteratorReduce);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorReduce);
 
 void TestPermutationIteratorHostDeviceGather(void)
 {
@@ -281,18 +279,20 @@ DECLARE_UNITTEST(TestPermutationIteratorHostDeviceScatter);
 template <typename Vector>
 void TestPermutationIteratorWithCountingIterator(void)
 {
-  typedef typename Vector::value_type T;
-  typedef typename Vector::iterator Iterator;
+  using T = typename Vector::value_type;
+  using diff_t = typename thrust::counting_iterator<T>::difference_type;
   
-  typename thrust::counting_iterator<T> input(0), index(0);
+  thrust::counting_iterator<T> input(0), index(0);
 
   // test copy()
   {
     Vector output(4,0);
 
-    thrust::copy(thrust::make_permutation_iterator(input, index),
-                 thrust::make_permutation_iterator(input, index + output.size()),
-                 output.begin());
+    auto first = thrust::make_permutation_iterator(input, index);
+    auto last  = thrust::make_permutation_iterator(input,
+                                                   index + static_cast<diff_t>(output.size()));
+
+    thrust::copy(first, last, output.begin());
 
     ASSERT_EQUAL(output[0], 0);
     ASSERT_EQUAL(output[1], 1);
@@ -315,5 +315,5 @@ void TestPermutationIteratorWithCountingIterator(void)
     ASSERT_EQUAL(output[3], 3);
   }
 }
-DECLARE_VECTOR_UNITTEST(TestPermutationIteratorWithCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestPermutationIteratorWithCountingIterator);
 
diff --git a/testing/preprocessor.cu b/testing/preprocessor.cu
new file mode 100644
index 000000000..643c9ad99
--- /dev/null
+++ b/testing/preprocessor.cu
@@ -0,0 +1,717 @@
+#include <unittest/unittest.h>
+#include <string>
+#include <thrust/detail/preprocessor.h>
+
+void test_pp_stringize()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(int))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello  world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE( hello  world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello  world ))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE( hello  world ))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(hello
+                                    world))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE("hello world"))
+  , "\"hello world\""
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE('hello world'))
+  , "'hello world'"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE($%!&<->))
+  , "$%!&<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE($%!&""<->))
+  , "$%!&\"\"<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_STRINGIZE))
+  , "THRUST_PP_STRINGIZE"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_STRINGIZE(int)))
+  , "\"int\""
+  );
+}
+DECLARE_UNITTEST(test_pp_stringize);
+
+void test_pp_cat2()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(i, nt)))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello, world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello , world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2( hello, world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello,  world)))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello, world )))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello,
+                                                   world )))
+  , "helloworld"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(hello world, from thrust!)))
+  , "hello worldfrom thrust!"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_CAT2(-, >)))
+  , "->"
+  );
+}
+DECLARE_UNITTEST(test_pp_cat2);
+
+#define THRUST_TEST_PP_EXPAND_TARGET() success
+
+#define THRUST_TEST_PP_EXPAND_ARGS() ()
+
+void test_pp_expand()
+{
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(int)))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello  world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND( hello  world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello  world )))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND( hello  world )))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(hello
+                                    world)))
+  , "hello world"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND("hello world")))
+  , "\"hello world\""
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND('hello world')))
+  , "'hello world'"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND($%!&<->)))
+  , "$%!&<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND($%!&""<->)))
+  , "$%!&\"\"<->"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(THRUST_PP_EXPAND)))
+  , "THRUST_PP_EXPAND"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(THRUST_PP_EXPAND(int))))
+  , "int"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(
+      THRUST_PP_CAT2(THRUST_TEST_, PP_EXPAND_TARGET)()
+    )))
+  , "success"
+  );
+
+  ASSERT_EQUAL(
+    std::string(THRUST_PP_STRINGIZE(THRUST_PP_EXPAND(
+      THRUST_TEST_PP_EXPAND_TARGET THRUST_TEST_PP_EXPAND_ARGS()
+    )))
+  , "success"
+  );
+}
+DECLARE_UNITTEST(test_pp_expand);
+
+#undef THRUST_TEST_PP_EXPAND_TARGET
+
+#undef THRUST_TEST_PP_EXPAND_ARGS
+
+void test_pp_arity()
+{
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY()
+  , 0
+  );
+
+  /* This bash script was used to generate these tests:
+
+    for arity in {0..62}
+    do
+      echo "  ASSERT_EQUAL("
+      echo "    THRUST_PP_ARITY("
+      echo "      `bash -c \"echo {0..${arity}} | tr ' ' ,\"`"
+      echo "    )"
+      echo "  , $((${arity} + 1))"
+      echo "  );"
+      echo
+    done
+  */
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0
+    )
+  , 1
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1
+    )
+  , 2
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2
+    )
+  , 3
+  );
+ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3
+    )
+  , 4
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4
+    )
+  , 5
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5
+    )
+  , 6
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6
+    )
+  , 7
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7
+    )
+  , 8
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8
+    )
+  , 9
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9
+    )
+  , 10
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10
+    )
+  , 11
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11
+    )
+  , 12
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12
+    )
+  , 13
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13
+    )
+  , 14
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+    )
+  , 15
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+    )
+  , 16
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
+    )
+  , 17
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
+    )
+  , 18
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
+    )
+  , 19
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
+    )
+  , 20
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
+    )
+  , 21
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
+    )
+  , 22
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22
+    )
+  , 23
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
+    )
+  , 24
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24
+    )
+  , 25
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
+    )
+  , 26
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
+    )
+  , 27
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
+    )
+  , 28
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
+    )
+  , 29
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
+    )
+  , 30
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
+    )
+  , 31
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
+    )
+  , 32
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32
+    )
+  , 33
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33
+    )
+  , 34
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34
+    )
+  , 35
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35
+    )
+  , 36
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36
+    )
+  , 37
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37
+    )
+  , 38
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38
+    )
+  , 39
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39
+    )
+  , 40
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40
+    )
+  , 41
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41
+    )
+  , 42
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42
+    )
+  , 43
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43
+    )
+  , 44
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44
+    )
+  , 45
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45
+    )
+  , 46
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46
+    )
+  , 47
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47
+    )
+  , 48
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
+    )
+  , 49
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
+    )
+  , 50
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50
+    )
+  , 51
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51
+    )
+  , 52
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52
+    )
+  , 53
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53
+    )
+  , 54
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
+    )
+  , 55
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55
+    )
+  , 56
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56
+    )
+  , 57
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57
+    )
+  , 58
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
+    )
+  , 59
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59
+    )
+  , 60
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60
+    )
+  , 61
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61
+    )
+  , 62
+  );
+
+  ASSERT_EQUAL(
+    THRUST_PP_ARITY(
+      0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62
+    )
+  , 63
+  );
+}
+DECLARE_UNITTEST(test_pp_arity);
+
+#define THRUST_TEST_PP_DISPATCH_PLUS(...)                                     \
+  THRUST_PP_DISPATCH(THRUST_TEST_PP_DISPATCH_PLUS, __VA_ARGS__)               \
+  /**/
+#define THRUST_TEST_PP_DISPATCH_PLUS0()        0
+#define THRUST_TEST_PP_DISPATCH_PLUS1(x)       x
+#define THRUST_TEST_PP_DISPATCH_PLUS2(x, y)    x + y
+#define THRUST_TEST_PP_DISPATCH_PLUS3(x, y, z) x + y + z
+
+void test_pp_dispatch()
+{
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS()
+  , 0
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(0)
+  , 0
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(1, 2)
+  , 3
+  );
+
+  ASSERT_EQUAL(
+    THRUST_TEST_PP_DISPATCH_PLUS(1, 2, 3)
+  , 6
+  );
+}
+DECLARE_UNITTEST(test_pp_dispatch);
+
+#undef THRUST_TEST_PP_DISPATCH_PLUS
+#undef THRUST_TEST_PP_DISPATCH_PLUS0
+#undef THRUST_TEST_PP_DISPATCH_PLUS1
+#undef THRUST_TEST_PP_DISPATCH_PLUS2
+#undef THRUST_TEST_PP_DISPATCH_PLUS3
+
diff --git a/testing/random.cu b/testing/random.cu
index 564cfbd85..53a165055 100644
--- a/testing/random.cu
+++ b/testing/random.cu
@@ -450,12 +450,21 @@ void TestRanlux48BaseEqual(void)
 DECLARE_UNITTEST(TestRanlux48BaseEqual);
 
 
+#if defined(__INTEL_COMPILER) && 1800 >= __INTEL_COMPILER
+void TestRanlux48BaseUnequal(void)
+{
+    // ICPC has a known failure with this test.
+    // See nvbug 200414000.
+    KNOWN_FAILURE;
+}
+#else
 void TestRanlux48BaseUnequal(void)
 {
   typedef thrust::random::ranlux48_base Engine;
 
   TestEngineUnequal<Engine>();
 }
+#endif
 DECLARE_UNITTEST(TestRanlux48BaseUnequal);
 
 
@@ -757,19 +766,29 @@ template<typename Distribution, typename Validator>
     // test Distribution with same range as engine
 
     // test host
-    thrust::generate(h.begin(), h.end(), Validator(Distribution(Engine::min, Engine::max)));
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305)
+    thrust::generate(h.begin(), h.end(), Validator(
+        Distribution(Engine::min, Engine::max)
+    ));
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
 
     ASSERT_EQUAL(true, h[0]);
 
     // test device
-    thrust::generate(d.begin(), d.end(), Validator(Distribution(Engine::min, Engine::max)));
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305)
+    thrust::generate(d.begin(), d.end(), Validator(
+        Distribution(Engine::min, Engine::max)
+    ));
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
 
     ASSERT_EQUAL(true, d[0]);
 
     // test Distribution with smaller range than engine
 
     // test host
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4305) // Truncation warning.
     typename Distribution::result_type engine_range = Engine::max - Engine::min;
+    THRUST_DISABLE_MSVC_WARNING_END(4305)
     thrust::generate(h.begin(), h.end(), Validator(Distribution(engine_range/3, (2 * engine_range)/3)));
 
     ASSERT_EQUAL(true, h[0]);
diff --git a/testing/reduce.cu b/testing/reduce.cu
index 4594df2de..cb08bc889 100644
--- a/testing/reduce.cu
+++ b/testing/reduce.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 #include <thrust/reduce.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/retag.h>
 #include <limits>
 
@@ -8,7 +9,7 @@ template<typename T>
   struct plus_mod_10
 {
   __host__ __device__
-  T operator()(T rhs, T lhs) const
+  T operator()(T lhs, T rhs) const
   {
     return ((lhs % 10) + (rhs % 10)) % 10;
   }
@@ -112,7 +113,7 @@ void TestReduceMixedTypes(void)
 
     // float -> int should use using plus<int> operator by default
     ASSERT_EQUAL(thrust::reduce(float_input.begin(), float_input.end(), (int) 0), 10);
-    
+
     // int -> float should use using plus<float> operator by default
     ASSERT_EQUAL(thrust::reduce(int_input.begin(), int_input.end(), (float) 0.5), 10.5);
 }
@@ -185,27 +186,47 @@ void TestReduceWithIndirection(void)
     table[5] = 2;
 
     T result = thrust::reduce(data.begin(), data.end(), T(0), plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
-    
+
     ASSERT_EQUAL(result, T(1));
 }
-DECLARE_VECTOR_UNITTEST(TestReduceWithIndirection);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestReduceWithIndirection);
 
 template<typename T>
-  void TestReduceCountingIterator(size_t n)
+  void TestReduceCountingIterator()
 {
-  // be careful not to generate a range larger than we can represent
-  n = thrust::min<size_t>(n, std::numeric_limits<T>::max());
+  size_t const n = 15 * sizeof(T);
+
+  ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
 
   thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
   thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
-  
-  T init = 13;
-  
+
+  T init = unittest::random_integer<T>();
+
   T h_result = thrust::reduce(h_first, h_first + n, init);
   T d_result = thrust::reduce(d_first, d_first + n, init);
-  
+
   // we use ASSERT_ALMOST_EQUAL because we're testing floating point types
   ASSERT_ALMOST_EQUAL(h_result, d_result);
 }
-DECLARE_VARIABLE_UNITTEST(TestReduceCountingIterator);
+DECLARE_GENERIC_UNITTEST(TestReduceCountingIterator);
 
+void TestReduceWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    long long result = thrust::reduce(thrust::device, begin, end);
+
+    ASSERT_EQUAL(result, 1ll << magnitude);
+}
+
+void TestReduceWithBigIndexes()
+{
+    TestReduceWithBigIndexesHelper(30);
+    TestReduceWithBigIndexesHelper(31);
+    TestReduceWithBigIndexesHelper(32);
+    TestReduceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestReduceWithBigIndexes);
diff --git a/testing/reduce_by_key.cu b/testing/reduce_by_key.cu
index 53f889368..f8539c066 100644
--- a/testing/reduce_by_key.cu
+++ b/testing/reduce_by_key.cu
@@ -109,7 +109,7 @@ void TestReduceByKeySimple(void)
     ASSERT_EQUAL(output_values[3], 15);
     ASSERT_EQUAL(output_values[4], 15);
 }
-DECLARE_VECTOR_UNITTEST(TestReduceByKeySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestReduceByKeySimple);
 
 template<typename K>
 struct TestReduceByKey
@@ -172,14 +172,6 @@ struct TestReduceByKeyToDiscardIterator
         thrust::device_vector<K> d_keys_output(n);
         thrust::device_vector<V> d_vals_output(n);
 
-        typedef typename thrust::host_vector<K>::iterator   HostKeyIterator;
-        typedef typename thrust::host_vector<V>::iterator   HostValIterator;
-        typedef typename thrust::device_vector<K>::iterator DeviceKeyIterator;
-        typedef typename thrust::device_vector<V>::iterator DeviceValIterator;
-
-        typedef typename thrust::pair<HostKeyIterator,  HostValIterator>   HostIteratorPair;
-        typedef typename thrust::pair<DeviceKeyIterator,DeviceValIterator> DeviceIteratorPair;
-
         thrust::host_vector<K> unique_keys = h_keys;
         unique_keys.erase(thrust::unique(unique_keys.begin(), unique_keys.end()), unique_keys.end());
 
diff --git a/testing/reduce_large.cu b/testing/reduce_large.cu
index cfe2d0973..170895ccc 100644
--- a/testing/reduce_large.cu
+++ b/testing/reduce_large.cu
@@ -10,12 +10,14 @@ void _TestReduceWithLargeTypes(void)
     thrust::host_vector< FixedVector<T,N> > h_data(n);
 
     for(size_t i = 0; i < h_data.size(); i++)
-        h_data[i] = FixedVector<T,N>(i);
+    {
+      h_data[i] = FixedVector<T, N>(static_cast<T>(i));
+    }
 
     thrust::device_vector< FixedVector<T,N> > d_data = h_data;
     
-    FixedVector<T,N> h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector<T,N>(0));
-    FixedVector<T,N> d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector<T,N>(0));
+    FixedVector<T,N> h_result = thrust::reduce(h_data.begin(), h_data.end(), FixedVector<T,N>(T{0}));
+    FixedVector<T,N> d_result = thrust::reduce(d_data.begin(), d_data.end(), FixedVector<T,N>(T{0}));
 
     ASSERT_EQUAL_QUIET(h_result, d_result);
 }
diff --git a/testing/regression/CMakeLists.txt b/testing/regression/CMakeLists.txt
new file mode 100644
index 000000000..eea8b3a45
--- /dev/null
+++ b/testing/regression/CMakeLists.txt
@@ -0,0 +1,20 @@
+#
+# Disabled as these test names are too long for CMAKE_OBJECT_PATH_MAX.
+# We should integrate these with the other unit tests.
+# See issue #1205.
+#
+return()
+
+file(GLOB test_srcs
+  RELATIVE "${CMAKE_CURRENT_LIST_DIR}}"
+  CONFIGURE_DEPENDS
+  *.cu *.cpp
+)
+
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  foreach(test_src IN LISTS test_srcs)
+    get_filename_component(test_name "${test_src}" NAME_WLE)
+    string(PREPEND test_name "regression.")
+    thrust_add_test(test_target ${test_name} "${test_src}" ${thrust_target})
+  endforeach()
+endforeach()
diff --git a/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu b/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu
new file mode 100644
index 000000000..01308aa27
--- /dev/null
+++ b/testing/regression/gh_911__merge_by_key_wrong_element_type_default_comparator.cu
@@ -0,0 +1,26 @@
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+int main() {
+  const int N = 100;
+  thrust::device_ptr<int> input_key_A1;
+  thrust::device_ptr<float> input_val_A1;
+  thrust::device_ptr<int> input_key_B1;
+  thrust::device_ptr<float> input_val_B1;
+  thrust::device_ptr<int> output_key;
+  thrust::device_ptr<float> output_val;
+
+  // use key tuples (with one element to keep it simple)
+  auto input_key_tuple_A = thrust::make_tuple(input_key_A1);
+  auto input_key_tuple_B = thrust::make_tuple(input_key_B1);
+  auto output_key_tuple = thrust::make_tuple(output_key);
+  // use zip iterator to zip together elements of a tuple (each is an iterator)
+  auto zip_it_A = thrust::make_zip_iterator(input_key_tuple_A);
+  auto zip_it_B = thrust::make_zip_iterator(input_key_tuple_B);
+  auto zip_it_out = thrust::make_zip_iterator(output_key_tuple);
+
+  // does NOT compile in CUDA 9.1 (compiles fine in CUDA 8)
+  thrust::merge_by_key(zip_it_A, zip_it_A + N, zip_it_B, zip_it_B + N, input_val_A1, input_val_B1, zip_it_out, output_val);
+
+  return 0;
+}
+
diff --git a/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu b/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu
new file mode 100644
index 000000000..3904933f3
--- /dev/null
+++ b/testing/regression/gh_919_nvbug_2318871__zip_iterator_with_complex.cu
@@ -0,0 +1,40 @@
+#include <thrust/device_vector.h>
+#include <thrust/complex.h>
+#include <thrust/tuple.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/sequence.h>
+#include <thrust/copy.h>
+#include <thrust/gather.h>
+   
+struct greater_than_5 
+{
+  template <typename T>
+  __host__ __device__
+  bool operator()(T val)
+  {
+    return abs(val) > 5;
+  }
+};
+ 
+int main()
+{
+  typedef thrust::complex<float> T;
+
+  thrust::device_vector<T> d(10);
+  thrust::sequence(d.begin(), d.end());
+  thrust::device_vector<T> r(10);
+
+  thrust::counting_iterator<int> c_begin(0); 
+  thrust::counting_iterator<int> c_end(c_begin + 10); 
+
+  thrust::device_vector<int> idxs(10);
+
+  thrust::copy_if(
+    thrust::make_zip_iterator(thrust::make_tuple(c_begin, d.begin()))
+  , thrust::make_zip_iterator(thrust::make_tuple(c_end, d.end()))
+  , d.begin()
+  , thrust::make_zip_iterator(thrust::make_tuple(idxs.begin(), r.begin()))
+  , greater_than_5{}
+  );
+}
diff --git a/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu b/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu
new file mode 100644
index 000000000..ba422be60
--- /dev/null
+++ b/testing/regression/gh_928_nvbug_2341455__reduce_with_complex.cu
@@ -0,0 +1,10 @@
+#include <thrust/device_vector.h>
+#include <thrust/complex.h>
+#include <thrust/reduce.h>
+
+int main()
+{
+  thrust::device_vector<thrust::complex<double> > d(5);
+  thrust::reduce(d.begin(), d.end());
+}
+
diff --git a/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu b/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu
new file mode 100644
index 000000000..5e59633bb
--- /dev/null
+++ b/testing/regression/nvbug_1632709__reduce_large_input_sizes.cu
@@ -0,0 +1,20 @@
+#include <thrust/reduce.h> 
+#include <thrust/iterator/constant_iterator.h> 
+
+#include <assert.h>
+#include <iostream>
+ 
+int main()
+{ 
+  long long n = 10000000000; 
+
+  long long r = thrust::reduce(
+    thrust::constant_iterator<long long>(0)
+  , thrust::constant_iterator<long long>(n)
+  ); 
+
+  std::cout << r << std::endl;
+
+  assert(r == n);
+}
+ 
diff --git a/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu b/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu
new file mode 100644
index 000000000..646fdc558
--- /dev/null
+++ b/testing/regression/nvbug_1940974__merge_with_constant_iterator.cu
@@ -0,0 +1,35 @@
+#include <thrust/device_vector.h>
+#include <thrust/merge.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+struct comp
+{
+  template<typename Tuple1, typename Tuple2>
+  __host__ __device__
+  bool operator()(const Tuple1& t1, const Tuple2& t2) 
+  {
+    return thrust::get<0>(t1) == thrust::get<1>(t2);
+  }
+};
+
+int main()
+{
+    typedef thrust::device_vector<int> Vector;
+
+    Vector second(10), third(5), fourth(5), indices(15);
+
+    thrust::merge_by_key(thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<int>(12), second.begin())),
+                         thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<int>(12), second.begin())) + 10, 
+                         thrust::make_zip_iterator(thrust::make_tuple(third.begin(), fourth.begin())),
+                         thrust::make_zip_iterator(thrust::make_tuple(third.begin(), fourth.begin())) + 5,
+                         thrust::counting_iterator<int>(0),
+                         thrust::counting_iterator<int>(10),
+                         thrust::make_discard_iterator(),
+                         indices.begin(),
+                         comp());
+
+    return 0;
+}
+ 
diff --git a/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu b/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu
new file mode 100644
index 000000000..c01c0ad4e
--- /dev/null
+++ b/testing/regression/nvbug_1965743__unnecessary_static_on_get_occ_device_properties.cu
@@ -0,0 +1,5 @@
+// nvcc -Xcompiler -Wall -Xcompiler -Werror -ccbin=clang
+
+#include <thrust/system/cuda/detail/core/util.h>
+
+int main() {}
diff --git a/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu
new file mode 100644
index 000000000..f06945328
--- /dev/null
+++ b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.cu
@@ -0,0 +1,20 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{  
+  return make_uint2(a.x + b.x, a.y + b.y); 
+} 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero);  
+  
+  return 0;  
+}
+ 
diff --git a/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu
new file mode 100644
index 000000000..f987c2f3f
--- /dev/null
+++ b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed0.cu
@@ -0,0 +1,22 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+struct uint2_adder 
+{ 
+  __host__ __device__ uint2 operator()(uint2 a, uint2 b) {  
+    return make_uint2(a.x + b.x, a.y + b.y); 
+  } 
+}; 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero, uint2_adder());  
+  
+  return 0;  
+}
+ 
diff --git a/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu
new file mode 100644
index 000000000..4ccf67d39
--- /dev/null
+++ b/testing/regression/nvbug_1990211__scan_requires_assignability_from_zero.fixed1.cu
@@ -0,0 +1,20 @@
+#include <thrust/scan.h>
+#include <thrust/device_ptr.h>
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)
+{  
+  return make_uint2(a.x + b.x, a.y + b.y); 
+} 
+  
+int main() {  
+  int num_elements = 32;  
+  uint2 *input = NULL, *output = NULL;
+  const uint2 zero = make_uint2(0,0);  
+  
+  thrust::exclusive_scan(thrust::device_ptr<uint2>((uint2*)input), 
+                         thrust::device_ptr<uint2>((uint2*)input + num_elements), 
+                         thrust::device_ptr<uint2>(output), zero, operator+);  
+  
+  return 0;  
+}
+ 
diff --git a/testing/remove.cu b/testing/remove.cu
index bdc7a8ccd..95b679dc7 100644
--- a/testing/remove.cu
+++ b/testing/remove.cu
@@ -30,14 +30,14 @@ void TestRemoveSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
-    typename Vector::iterator end = thrust::remove(data.begin(), 
-                                                    data.end(), 
+    typename Vector::iterator end = thrust::remove(data.begin(),
+                                                    data.end(),
                                                     (T) 2);
 
     ASSERT_EQUAL(end - data.begin(), 3);
@@ -102,17 +102,17 @@ void TestRemoveCopySimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy(data.begin(), 
-                                                        data.end(), 
-                                                        result.begin(), 
+    typename Vector::iterator end = thrust::remove_copy(data.begin(),
+                                                        data.end(),
+                                                        result.begin(),
                                                         (T) 2);
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -186,14 +186,14 @@ void TestRemoveIfSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
-    typename Vector::iterator end = thrust::remove_if(data.begin(), 
-                                                      data.end(), 
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
+                                                      data.end(),
                                                       is_even<T>());
 
     ASSERT_EQUAL(end - data.begin(), 3);
@@ -202,7 +202,7 @@ void TestRemoveIfSimple(void)
     ASSERT_EQUAL(data[1], 1);
     ASSERT_EQUAL(data[2], 3);
 }
-DECLARE_VECTOR_UNITTEST(TestRemoveIfSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestRemoveIfSimple);
 
 
 template<typename ForwardIterator,
@@ -210,7 +210,7 @@ template<typename ForwardIterator,
 ForwardIterator remove_if(my_system &system,
                           ForwardIterator first,
                           ForwardIterator,
-                          Predicate pred)
+                          Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -233,7 +233,7 @@ template<typename ForwardIterator,
 ForwardIterator remove_if(my_tag,
                           ForwardIterator first,
                           ForwardIterator,
-                          Predicate pred)
+                          Predicate)
 {
     *first = 13;
     return first;
@@ -258,11 +258,11 @@ void TestRemoveIfStencilSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector stencil(5);
     stencil[0] = 0;
@@ -271,7 +271,7 @@ void TestRemoveIfStencilSimple(void)
     stencil[3] = 0;
     stencil[4] = 1;
 
-    typename Vector::iterator end = thrust::remove_if(data.begin(), 
+    typename Vector::iterator end = thrust::remove_if(data.begin(),
                                                       data.end(),
                                                       stencil.begin(),
                                                       thrust::identity<T>());
@@ -292,7 +292,7 @@ ForwardIterator remove_if(my_system &system,
                           ForwardIterator first,
                           ForwardIterator,
                           InputIterator,
-                          Predicate pred)
+                          Predicate)
 {
     system.validate_dispatch();
     return first;
@@ -321,7 +321,7 @@ ForwardIterator remove_if(my_tag,
                           ForwardIterator first,
                           ForwardIterator,
                           InputIterator,
-                          Predicate pred)
+                          Predicate)
 {
     *first = 13;
     return first;
@@ -347,17 +347,17 @@ void TestRemoveCopyIfSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy_if(data.begin(), 
-                                                           data.end(), 
-                                                           result.begin(), 
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
+                                                           result.begin(),
                                                            is_even<T>());
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -366,7 +366,7 @@ void TestRemoveCopyIfSimple(void)
     ASSERT_EQUAL(result[1], 1);
     ASSERT_EQUAL(result[2], 3);
 }
-DECLARE_VECTOR_UNITTEST(TestRemoveCopyIfSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestRemoveCopyIfSimple);
 
 
 template<typename InputIterator,
@@ -431,11 +431,11 @@ void TestRemoveCopyIfStencilSimple(void)
     typedef typename Vector::value_type T;
 
     Vector data(5);
-    data[0] =  1; 
-    data[1] =  2; 
+    data[0] =  1;
+    data[1] =  2;
     data[2] =  1;
-    data[3] =  3; 
-    data[4] =  2; 
+    data[3] =  3;
+    data[4] =  2;
 
     Vector stencil(5);
     stencil[0] = 0;
@@ -446,10 +446,10 @@ void TestRemoveCopyIfStencilSimple(void)
 
     Vector result(5);
 
-    typename Vector::iterator end = thrust::remove_copy_if(data.begin(), 
-                                                           data.end(), 
+    typename Vector::iterator end = thrust::remove_copy_if(data.begin(),
+                                                           data.end(),
                                                            stencil.begin(),
-                                                           result.begin(), 
+                                                           result.begin(),
                                                            thrust::identity<T>());
 
     ASSERT_EQUAL(end - result.begin(), 3);
@@ -531,7 +531,7 @@ void TestRemove(const size_t n)
 
     size_t h_size = thrust::remove(h_data.begin(), h_data.end(), T(0)) - h_data.begin();
     size_t d_size = thrust::remove(d_data.begin(), d_data.end(), T(0)) - d_data.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -550,7 +550,7 @@ void TestRemoveIf(const size_t n)
 
     size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), is_true<T>()) - h_data.begin();
     size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), is_true<T>()) - d_data.begin();
-   
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -569,10 +569,10 @@ void TestRemoveIfStencil(const size_t n)
 
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
-    
+
     size_t h_size = thrust::remove_if(h_data.begin(), h_data.end(), h_stencil.begin(), is_true<T>()) - h_data.begin();
     size_t d_size = thrust::remove_if(d_data.begin(), d_data.end(), d_stencil.begin(), is_true<T>()) - d_data.begin();
-   
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_data.resize(h_size);
@@ -588,13 +588,13 @@ void TestRemoveCopy(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
 
     size_t h_size = thrust::remove_copy(h_data.begin(), h_data.end(), h_result.begin(), T(0)) - h_result.begin();
     size_t d_size = thrust::remove_copy(d_data.begin(), d_data.end(), d_result.begin(), T(0)) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -621,7 +621,7 @@ void TestRemoveCopyToDiscardIterator(const size_t n)
       thrust::remove_copy(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), T(0));
 
     thrust::discard_iterator<> reference(num_nonzeros);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
@@ -659,7 +659,7 @@ void TestRemoveCopyToDiscardIteratorZipped(const size_t n)
                           thrust::make_tuple(T(0),T(0)));
 
     thrust::discard_iterator<> reference(num_nonzeros);
-    
+
     ASSERT_EQUAL(h_output, d_output);
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple()));
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple()));
@@ -675,10 +675,10 @@ void TestRemoveCopyIf(const size_t n)
 
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
-    
+
     size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_result.begin(), is_true<T>()) - h_result.begin();
     size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_result.begin(), is_true<T>()) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -716,16 +716,16 @@ void TestRemoveCopyIfStencil(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
-    
+
     thrust::host_vector<T>   h_result(n);
     thrust::device_vector<T> d_result(n);
 
     size_t h_size = thrust::remove_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_result.begin(), is_true<T>()) - h_result.begin();
     size_t d_size = thrust::remove_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_result.begin(), is_true<T>()) - d_result.begin();
-    
+
     ASSERT_EQUAL(h_size, d_size);
 
     h_result.resize(h_size);
@@ -741,7 +741,7 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n)
 {
     thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
     thrust::device_vector<T> d_data = h_data;
-    
+
     thrust::host_vector<bool>   h_stencil = unittest::random_integers<bool>(n);
     thrust::device_vector<bool> d_stencil = h_stencil;
 
@@ -759,4 +759,3 @@ void TestRemoveCopyIfStencilToDiscardIterator(const size_t n)
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
 DECLARE_VARIABLE_UNITTEST(TestRemoveCopyIfStencilToDiscardIterator);
-
diff --git a/testing/replace.cu b/testing/replace.cu
index 1edbaafb1..9ba33ddde 100644
--- a/testing/replace.cu
+++ b/testing/replace.cu
@@ -33,7 +33,7 @@ DECLARE_VECTOR_UNITTEST(TestReplaceSimple);
 
 template<typename ForwardIterator, typename T>
 void replace(my_system &system,
-             ForwardIterator first, ForwardIterator, const T &,
+             ForwardIterator, ForwardIterator, const T &,
              const T &)
 {
     system.validate_dispatch();
@@ -256,7 +256,7 @@ DECLARE_VECTOR_UNITTEST(TestReplaceIfSimple);
 
 template<typename ForwardIterator, typename Predicate, typename T>
 void replace_if(my_system &system,
-                ForwardIterator first, ForwardIterator,
+                ForwardIterator, ForwardIterator,
                 Predicate,
                 const T &)
 {
@@ -337,7 +337,7 @@ DECLARE_VECTOR_UNITTEST(TestReplaceIfStencilSimple);
 
 template<typename ForwardIterator, typename InputIterator, typename Predicate, typename T>
 void replace_if(my_system &system,
-                ForwardIterator first, ForwardIterator,
+                ForwardIterator, ForwardIterator,
                 InputIterator,
                 Predicate,
                 const T &)
@@ -603,8 +603,8 @@ void TestReplaceCopyIf(const size_t n)
     thrust::host_vector<T>   h_dest(n);
     thrust::device_vector<T> d_dest(n);
 
-    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<T>(), 0);
-    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_dest.begin(), less_than_five<T>(), T{0});
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_dest.begin(), less_than_five<T>(), T{0});
 
     ASSERT_ALMOST_EQUAL(h_data, d_data);
     ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -619,10 +619,10 @@ void TestReplaceCopyIfToDiscardIterator(const size_t n)
     thrust::device_vector<T> d_data = h_data;
 
     thrust::discard_iterator<> h_result =
-      thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> d_result =
-      thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> reference(n);
 
@@ -643,8 +643,8 @@ void TestReplaceCopyIfStencil(const size_t n)
     thrust::host_vector<T>   h_dest(n);
     thrust::device_vector<T> d_dest(n);
 
-    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<T>(), 0);
-    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<T>(), 0);
+    thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), h_dest.begin(), less_than_five<T>(), T{0});
+    thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), d_dest.begin(), less_than_five<T>(), T{0});
 
     ASSERT_ALMOST_EQUAL(h_data, d_data);
     ASSERT_ALMOST_EQUAL(h_dest, d_dest);
@@ -661,10 +661,10 @@ void TestReplaceCopyIfStencilToDiscardIterator(const size_t n)
     thrust::device_vector<T> d_stencil = h_stencil;
 
     thrust::discard_iterator<> h_result =
-      thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(h_data.begin(), h_data.end(), h_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> d_result =
-      thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), 0);
+      thrust::replace_copy_if(d_data.begin(), d_data.end(), d_stencil.begin(), thrust::make_discard_iterator(), less_than_five<T>(), T{0});
 
     thrust::discard_iterator<> reference(n);
 
diff --git a/testing/reverse.cu b/testing/reverse.cu
index ea0cf5d29..1ea4b9b38 100644
--- a/testing/reverse.cu
+++ b/testing/reverse.cu
@@ -32,8 +32,8 @@ DECLARE_VECTOR_UNITTEST(TestReverseSimple);
 
 template<typename BidirectionalIterator>
 void reverse(my_system &system,
-             BidirectionalIterator first,
-             BidirectionalIterator last)
+             BidirectionalIterator,
+             BidirectionalIterator)
 {
   system.validate_dispatch();
 }
@@ -53,7 +53,7 @@ DECLARE_UNITTEST(TestReverseDispatchExplicit);
 template<typename BidirectionalIterator>
 void reverse(my_tag,
              BidirectionalIterator first,
-             BidirectionalIterator last)
+             BidirectionalIterator)
 {
   *first = 13;
 }
@@ -73,6 +73,16 @@ DECLARE_UNITTEST(TestReverseDispatchImplicit);
 template<typename Vector>
 void TestReverseCopySimple(void)
 {
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && \
+    THRUST_GCC_VERSION >= 80000 && THRUST_GCC_VERSION < 100000
+
+  if (typeid(Vector) == typeid(thrust::host_vector<custom_numeric>))
+  {
+    KNOWN_FAILURE // WAR NVBug 2481122
+  }
+
+#endif
+
   typedef typename Vector::iterator   Iterator;
 
   Vector input(5);
diff --git a/testing/scan.cu b/testing/scan.cu
index 50c53ce36..bceac4038 100644
--- a/testing/scan.cu
+++ b/testing/scan.cu
@@ -1,8 +1,14 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+
 #include <thrust/scan.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/retag.h>
+#include <thrust/device_malloc.h>
+#include <thrust/device_free.h>
 
 
 template<typename T>
@@ -20,6 +26,17 @@ template <class Vector>
 void TestScanSimple(void)
 {
     typedef typename Vector::value_type T;
+
+    // icc miscompiles the intermediate sum updates for custom_numeric.
+    // The issue doesn't happen with opts disabled, or on other compilers.
+    // Printing the intermediate sum each iteration "fixes" the issue,
+    // so likely a bad optimization.
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL
+    if (std::is_same<T, custom_numeric>::value)
+    {
+      return;
+    }
+#endif
     
     typename Vector::iterator iter;
 
@@ -34,35 +51,35 @@ void TestScanSimple(void)
     // inclusive scan
     iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin());
     result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
     // exclusive scan
-    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), 0);
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(0));
     result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
     // exclusive scan with init
-    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), 3);
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3));
     result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
     // inclusive scan with op
     iter = thrust::inclusive_scan(input.begin(), input.end(), output.begin(), thrust::plus<T>());
     result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
 
     // exclusive scan with init and op
-    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), 3, thrust::plus<T>());
+    iter = thrust::exclusive_scan(input.begin(), input.end(), output.begin(), T(3), thrust::plus<T>());
     result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
 
@@ -70,21 +87,21 @@ void TestScanSimple(void)
     input = input_copy;
     iter = thrust::inclusive_scan(input.begin(), input.end(), input.begin());
     result[0] = 1; result[1] = 4; result[2] = 2; result[3] = 6; result[4] = 1;
-    ASSERT_EQUAL(iter - input.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
     ASSERT_EQUAL(input, result);
 
     // inplace exclusive scan with init
     input = input_copy;
-    iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin(), 3);
+    iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin(), T(3));
     result[0] = 3; result[1] = 4; result[2] = 7; result[3] = 5; result[4] = 9;
-    ASSERT_EQUAL(iter - input.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
     ASSERT_EQUAL(input, result);
 
     // inplace exclusive scan with implicit init=0
     input = input_copy;
     iter = thrust::exclusive_scan(input.begin(), input.end(), input.begin());
     result[0] = 0; result[1] = 1; result[2] = 4; result[3] = 2; result[4] = 6;
-    ASSERT_EQUAL(iter - input.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
     ASSERT_EQUAL(input, result);
 }
 DECLARE_VECTOR_UNITTEST(TestScanSimple);
@@ -247,48 +264,49 @@ void TestScanMixedTypes(void)
 
     IntVector   int_output(4);
     FloatVector float_output(4);
-     
-    // float -> int should use using plus<int> operator by default
+
+    // float -> int should use plus<void> operator and float accumulator by default
     thrust::inclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
-    ASSERT_EQUAL(int_output[0],  1);
-    ASSERT_EQUAL(int_output[1],  3);
-    ASSERT_EQUAL(int_output[2],  6);
-    ASSERT_EQUAL(int_output[3], 10);
-    
-    // float -> float with plus<int> operator (int accumulator)
+    ASSERT_EQUAL(int_output[0],  1); // in: 1.5 accum: 1.5f out: 1
+    ASSERT_EQUAL(int_output[1],  4); // in: 2.5 accum: 4.0f out: 4
+    ASSERT_EQUAL(int_output[2],  7); // in: 3.5 accum: 7.5f out: 7
+    ASSERT_EQUAL(int_output[3], 12); // in: 4.5 accum: 12.f out: 12
+
+    // float -> float with plus<int> operator (float accumulator)
     thrust::inclusive_scan(float_input.begin(), float_input.end(), float_output.begin(), thrust::plus<int>());
-    ASSERT_EQUAL(float_output[0],  1.0);
-    ASSERT_EQUAL(float_output[1],  3.0);
-    ASSERT_EQUAL(float_output[2],  6.0);
-    ASSERT_EQUAL(float_output[3], 10.0);
-    
-    // float -> int should use using plus<int> operator by default
+    ASSERT_EQUAL(float_output[0],  1.5f); // in: 1.5 accum: 1.5f out: 1.5f
+    ASSERT_EQUAL(float_output[1],  3.0f); // in: 2.5 accum: 3.0f out: 3.0f
+    ASSERT_EQUAL(float_output[2],  6.0f); // in: 3.5 accum: 6.0f out: 6.0f
+    ASSERT_EQUAL(float_output[3], 10.0f); // in: 4.5 accum: 10.f out: 10.f
+
+    // float -> int should use plus<void> operator and float accumulator by default
     thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin());
-    ASSERT_EQUAL(int_output[0], 0);
-    ASSERT_EQUAL(int_output[1], 1);
-    ASSERT_EQUAL(int_output[2], 3);
-    ASSERT_EQUAL(int_output[3], 6);
-    
-    // float -> int should use using plus<int> operator by default
+    ASSERT_EQUAL(int_output[0], 0); // out: 0.0f  in: 1.5 accum: 1.5f
+    ASSERT_EQUAL(int_output[1], 1); // out: 1.5f  in: 2.5 accum: 4.0f
+    ASSERT_EQUAL(int_output[2], 4); // out: 4.0f  in: 3.5 accum: 7.5f
+    ASSERT_EQUAL(int_output[3], 7); // out: 7.5f  in: 4.5 accum: 12.f
+
+    // float -> int should use plus<> operator and float accumulator by default
     thrust::exclusive_scan(float_input.begin(), float_input.end(), int_output.begin(), (float) 5.5);
-    ASSERT_EQUAL(int_output[0],  5);
-    ASSERT_EQUAL(int_output[1],  7);
-    ASSERT_EQUAL(int_output[2],  9);
-    ASSERT_EQUAL(int_output[3], 13);
-    
-    // int -> float should use using plus<float> operator by default
+    ASSERT_EQUAL(int_output[0],  5); // out: 5.5f  in: 1.5 accum: 7.0f
+    ASSERT_EQUAL(int_output[1],  7); // out: 7.0f  in: 2.5 accum: 9.5f
+    ASSERT_EQUAL(int_output[2],  9); // out: 9.5f  in: 3.5 accum: 13.0f
+    ASSERT_EQUAL(int_output[3], 13); // out: 13.f  in: 4.5 accum: 17.4f
+
+    // int -> float should use using plus<> operator and int accumulator by default
     thrust::inclusive_scan(int_input.begin(), int_input.end(), float_output.begin());
-    ASSERT_EQUAL(float_output[0],  1.0);
-    ASSERT_EQUAL(float_output[1],  3.0);
-    ASSERT_EQUAL(float_output[2],  6.0);
-    ASSERT_EQUAL(float_output[3], 10.0);
-    
-    // int -> float should use using plus<float> operator by default
+    ASSERT_EQUAL(float_output[0],  1.f); // in: 1 accum: 1  out: 1
+    ASSERT_EQUAL(float_output[1],  3.f); // in: 2 accum: 3  out: 3
+    ASSERT_EQUAL(float_output[2],  6.f); // in: 3 accum: 6  out: 6
+    ASSERT_EQUAL(float_output[3], 10.f); // in: 4 accum: 10 out: 10
+
+    // int -> float + float init_value should use using plus<> operator and
+    // float accumulator by default
     thrust::exclusive_scan(int_input.begin(), int_input.end(), float_output.begin(), (float) 5.5);
-    ASSERT_EQUAL(float_output[0],  5.5);
-    ASSERT_EQUAL(float_output[1],  6.5);
-    ASSERT_EQUAL(float_output[2],  8.5);
-    ASSERT_EQUAL(float_output[3], 11.5);
+    ASSERT_EQUAL(float_output[0],  5.5f); // out: 5.5f  in: 1 accum: 6.5f
+    ASSERT_EQUAL(float_output[1],  6.5f); // out: 6.0f  in: 2 accum: 8.5f
+    ASSERT_EQUAL(float_output[2],  8.5f); // out: 8.0f  in: 3 accum: 11.5f
+    ASSERT_EQUAL(float_output[3], 11.5f); // out: 11.f  in: 4 accum: 15.5f
 }
 void TestScanMixedTypesHost(void)
 {
@@ -476,7 +494,9 @@ void _TestScanWithLargeTypes(void)
     thrust::host_vector< FixedVector<T,N> > h_output(n);
 
     for(size_t i = 0; i < h_input.size(); i++)
-        h_input[i] = FixedVector<T,N>(i);
+    {
+        h_input[i] = FixedVector<T, N>(static_cast<T>(i));
+    }
 
     thrust::device_vector< FixedVector<T,N> > d_input = h_input;
     thrust::device_vector< FixedVector<T,N> > d_output(n);
@@ -496,8 +516,7 @@ void TestScanWithLargeTypes(void)
 {
   _TestScanWithLargeTypes<int,  1>();
 
-  // XXX these are too big for sm_1x
-#if THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA
+#if !defined(__QNX__)
   _TestScanWithLargeTypes<int,  8>();
   _TestScanWithLargeTypes<int, 64>();
 #else
@@ -554,5 +573,167 @@ void TestInclusiveScanWithIndirection(void)
     ASSERT_EQUAL(data[5], T(0));
     ASSERT_EQUAL(data[6], T(1));
 }
-DECLARE_VECTOR_UNITTEST(TestInclusiveScanWithIndirection);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithIndirection);
+
+template <typename T>
+struct const_ref_plus_mod3
+{
+    T * table;
+
+    const_ref_plus_mod3(T * table) : table(table) {}
+
+    __host__ __device__
+    const T& operator()(T a, T b)
+    {
+        return table[(int) (a + b)];
+    }
+};
+
+template <typename Vector>
+void TestInclusiveScanWithConstAccumulator(void)
+{
+    // add numbers modulo 3 with external lookup table
+    typedef typename Vector::value_type T;
+
+    Vector data(7);
+    data[0] = 0;
+    data[1] = 1;
+    data[2] = 2;
+    data[3] = 1;
+    data[4] = 2;
+    data[5] = 0;
+    data[6] = 1;
+
+    Vector table(6);
+    table[0] = 0;
+    table[1] = 1;
+    table[2] = 2;
+    table[3] = 0;
+    table[4] = 1;
+    table[5] = 2;
+
+    thrust::inclusive_scan(data.begin(), data.end(), data.begin(), const_ref_plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
+    
+    ASSERT_EQUAL(data[0], T(0));
+    ASSERT_EQUAL(data[1], T(1));
+    ASSERT_EQUAL(data[2], T(0));
+    ASSERT_EQUAL(data[3], T(1));
+    ASSERT_EQUAL(data[4], T(0));
+    ASSERT_EQUAL(data[5], T(0));
+    ASSERT_EQUAL(data[6], T(1));
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestInclusiveScanWithConstAccumulator);
+
+struct only_set_when_expected_it
+{
+    long long expected;
+    bool * flag;
+
+    __host__ __device__ only_set_when_expected_it operator++() const { return *this; }
+    __host__ __device__ only_set_when_expected_it operator*() const { return *this; }
+    template<typename Difference>
+    __host__ __device__ only_set_when_expected_it operator+(Difference) const { return *this; }
+    template<typename Index>
+    __host__ __device__ only_set_when_expected_it operator[](Index) const { return *this; }
+
+    __device__
+    void operator=(long long value) const
+    {
+        if (value == expected)
+        {
+            *flag = true;
+        }
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+template<>
+struct iterator_traits<only_set_when_expected_it>
+{
+    typedef long long value_type;
+    typedef only_set_when_expected_it reference;
+};
+THRUST_NAMESPACE_END
+
+void TestInclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude), thrust::raw_pointer_cast(has_executed) };
+
+    thrust::inclusive_scan(thrust::device, begin, end, out);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestInclusiveScanWithBigIndexes()
+{
+  TestInclusiveScanWithBigIndexesHelper(30);
+  TestInclusiveScanWithBigIndexesHelper(31);
+  TestInclusiveScanWithBigIndexesHelper(32);
+  TestInclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestInclusiveScanWithBigIndexes);
+
+void TestExclusiveScanWithBigIndexesHelper(int magnitude)
+{
+    thrust::constant_iterator<long long> begin(1);
+    thrust::constant_iterator<long long> end = begin + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_ptr<bool> has_executed = thrust::device_malloc<bool>(1);
+    *has_executed = false;
+
+    only_set_when_expected_it out = { (1ll << magnitude) - 1, thrust::raw_pointer_cast(has_executed) };
+
+    thrust::exclusive_scan(thrust::device, begin, end, out,0ll);
+
+    bool has_executed_h = *has_executed;
+    thrust::device_free(has_executed);
+
+    ASSERT_EQUAL(has_executed_h, true);
+}
+
+void TestExclusiveScanWithBigIndexes()
+{
+  TestExclusiveScanWithBigIndexesHelper(30);
+  TestExclusiveScanWithBigIndexesHelper(31);
+  TestExclusiveScanWithBigIndexesHelper(32);
+  TestExclusiveScanWithBigIndexesHelper(33);
+}
+
+DECLARE_UNITTEST(TestExclusiveScanWithBigIndexes);
+
+#if THRUST_CPP_DIALECT >= 2011
+
+struct Int {
+    int i{};
+    __host__ __device__ explicit Int(int num) : i(num) {}
+    __host__ __device__ Int() : i{} {}
+    __host__ __device__ Int operator+(Int const& o) const { return Int{this->i + o.i}; }
+};
+
+void TestInclusiveScanWithUserDefinedType()
+{
+    thrust::device_vector<Int> vec(5, Int{1});
+
+    thrust::inclusive_scan(
+        thrust::device,
+        vec.cbegin(),
+        vec.cend(),
+        vec.begin());
+
+    ASSERT_EQUAL(static_cast<Int>(vec.back()).i, 5);
+}
+DECLARE_UNITTEST(TestInclusiveScanWithUserDefinedType);
 
+#endif // c++11
diff --git a/testing/scan_by_key.cu b/testing/scan_by_key.cu
deleted file mode 100644
index eb3d2e1ba..000000000
--- a/testing/scan_by_key.cu
+++ /dev/null
@@ -1,632 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/scan.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/retag.h>
-#include <thrust/random.h>
-
-
-template <typename Vector>
-void TestInclusiveScanByKeySimple(void)
-{
-    typedef typename Vector::value_type T;
-    typedef typename Vector::iterator   Iterator;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-
-    Iterator iter = thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL_QUIET(iter, output.end());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>(), thrust::multiplies<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  6);
-    ASSERT_EQUAL(output[3], 24);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 42);
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), thrust::equal_to<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-}
-DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator inclusive_scan_by_key(my_system &system,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    system.validate_dispatch();
-    return result;
-}
-
-void TestInclusiveScanByKeyDispatchExplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    my_system sys(0);
-    thrust::inclusive_scan_by_key(sys,
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin());
-
-    ASSERT_EQUAL(true, sys.is_valid());
-}
-DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator inclusive_scan_by_key(my_tag,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    *result = 13;
-    return result;
-}
-
-void TestInclusiveScanByKeyDispatchImplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    thrust::inclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()));
-
-    ASSERT_EQUAL(13, vec.front());
-}
-DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit);
-
-
-template <typename Vector>
-void TestExclusiveScanByKeySimple(void)
-{
-    typedef typename Vector::value_type T;
-    typedef typename Vector::iterator   Iterator;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-    
-    Iterator iter = thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL_QUIET(iter, output.end());
-
-    ASSERT_EQUAL(output[0], 0);
-    ASSERT_EQUAL(output[1], 0);
-    ASSERT_EQUAL(output[2], 2);
-    ASSERT_EQUAL(output[3], 5);
-    ASSERT_EQUAL(output[4], 0);
-    ASSERT_EQUAL(output[5], 0);
-    ASSERT_EQUAL(output[6], 6);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10));
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-    
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>(), thrust::multiplies<T>());
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 20);
-    ASSERT_EQUAL(output[3], 60);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 60);
-    
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), thrust::equal_to<T>());
-
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator exclusive_scan_by_key(my_system &system,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    system.validate_dispatch();
-    return result;
-}
-
-void TestExclusiveScanByKeyDispatchExplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    my_system sys(0);
-    thrust::exclusive_scan_by_key(sys,
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin(),
-                                  vec.begin());
-
-    ASSERT_EQUAL(true, sys.is_valid());
-}
-DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit);
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator>
-OutputIterator exclusive_scan_by_key(my_tag,
-                                     InputIterator1,
-                                     InputIterator1,
-                                     InputIterator2,
-                                     OutputIterator result)
-{
-    *result = 13;
-    return result;
-}
-
-void TestExclusiveScanByKeyDispatchImplicit()
-{
-    thrust::device_vector<int> vec(1);
-
-    thrust::exclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()),
-                                  thrust::retag<my_tag>(vec.begin()));
-
-    ASSERT_EQUAL(13, vec.front());
-}
-DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit);
-
-
-struct head_flag_predicate
-{
-    template <typename T>
-    __host__ __device__
-    bool operator()(const T& a, const T& b)
-    {
-        return b ? false : true;
-    }
-};
-
-template <typename Vector>
-void TestScanByKeyHeadFlags(void)
-{
-    typedef typename Vector::value_type T;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 0; vals[2] = 3;
-    keys[3] = 0; vals[3] = 4;
-    keys[4] = 1; vals[4] = 5;
-    keys[5] = 1; vals[5] = 6;
-    keys[6] = 0; vals[6] = 7;
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), head_flag_predicate(), thrust::plus<T>());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), T(10), head_flag_predicate(), thrust::plus<T>());
-    
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
-
-template <typename Vector>
-void TestInclusiveScanByKeyTransformIterator(void)
-{
-    typedef typename Vector::value_type T;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 2; vals[4] = 5;
-    keys[5] = 3; vals[5] = 6;
-    keys[6] = 3; vals[6] = 7;
-
-    thrust::inclusive_scan_by_key
-        (keys.begin(), keys.end(),
-         thrust::make_transform_iterator(vals.begin(), thrust::negate<T>()), 
-         output.begin());
-    
-    ASSERT_EQUAL(output[0],  -1);
-    ASSERT_EQUAL(output[1],  -2);
-    ASSERT_EQUAL(output[2],  -5);
-    ASSERT_EQUAL(output[3],  -9);
-    ASSERT_EQUAL(output[4],  -5);
-    ASSERT_EQUAL(output[5],  -6);
-    ASSERT_EQUAL(output[6], -13);
-}
-DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator);
-
-
-template <typename Vector>
-void TestScanByKeyReusedKeys(void)
-{
-    typedef typename Vector::value_type T;
-
-    Vector keys(7);
-    Vector vals(7);
-
-    Vector output(7, 0);
-
-    keys[0] = 0; vals[0] = 1;
-    keys[1] = 1; vals[1] = 2;
-    keys[2] = 1; vals[2] = 3;
-    keys[3] = 1; vals[3] = 4;
-    keys[4] = 0; vals[4] = 5;
-    keys[5] = 1; vals[5] = 6;
-    keys[6] = 1; vals[6] = 7;
-    
-    thrust::inclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin());
-
-    ASSERT_EQUAL(output[0],  1);
-    ASSERT_EQUAL(output[1],  2);
-    ASSERT_EQUAL(output[2],  5);
-    ASSERT_EQUAL(output[3],  9);
-    ASSERT_EQUAL(output[4],  5);
-    ASSERT_EQUAL(output[5],  6);
-    ASSERT_EQUAL(output[6], 13);
-
-    thrust::exclusive_scan_by_key(keys.begin(), keys.end(), vals.begin(), output.begin(), 10);
-    
-    ASSERT_EQUAL(output[0], 10);
-    ASSERT_EQUAL(output[1], 10);
-    ASSERT_EQUAL(output[2], 12);
-    ASSERT_EQUAL(output[3], 15);
-    ASSERT_EQUAL(output[4], 10);
-    ASSERT_EQUAL(output[5], 10);
-    ASSERT_EQUAL(output[6], 16);
-}
-DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
-
-
-template <typename T>
-void TestInclusiveScanByKey(const size_t n)
-{
-    // XXX WAR nvbug 1541533
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-    if(typeid(T) == typeid(char) ||
-       typeid(T) == typeid(unsigned char))
-    {
-      KNOWN_FAILURE;
-    }
-#endif
-
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey);
-
-
-template <typename T>
-void TestExclusiveScanByKey(const size_t n)
-{
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    // without init
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-    
-    // with init
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), (T) 11);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), (T) 11);
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey);
-
-
-template <typename T>
-void TestInclusiveScanByKeyInPlace(const size_t n)
-{
-    // XXX WAR nvbug 1541533
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-    if(typeid(T) == typeid(char) ||
-       typeid(T) == typeid(unsigned char))
-    {
-      KNOWN_FAILURE;
-    }
-#endif
-
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output(n);
-    thrust::device_vector<T> d_output(n);
-   
-    // in-place scans
-    h_output = h_vals;
-    d_output = d_vals;
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin());
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace);
-
-
-template <typename T>
-void TestExclusiveScanByKeyInPlace(const size_t n)
-{
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<T>   h_vals = unittest::random_integers<int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] = i % 10;
-    thrust::device_vector<T> d_vals = h_vals;
-
-    thrust::host_vector<T>   h_output = h_vals;
-    thrust::device_vector<T> d_output = d_vals;
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_output.begin(), h_output.begin(), (T) 11);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_output.begin(), d_output.begin(), (T) 11);
-    ASSERT_EQUAL(d_output, h_output);
-}
-DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace);
-
-
-void TestScanByKeyMixedTypes(void)
-{
-    const unsigned int n = 113;
-    
-    thrust::host_vector<int> h_keys(n);
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < n; i++){
-        h_keys[i] = k;
-        if (rng() % 10 == 0)
-            k++;
-    }
-    thrust::device_vector<int> d_keys = h_keys;
-
-    thrust::host_vector<unsigned int> h_vals = unittest::random_integers<unsigned int>(n);
-    for(size_t i = 0; i < n; i++)
-        h_vals[i] %= 10;
-    thrust::device_vector<unsigned int> d_vals = h_vals;
-
-    thrust::host_vector<float>   h_float_output(n);
-    thrust::device_vector<float> d_float_output(n);
-    thrust::host_vector<int>   h_int_output(n);
-    thrust::device_vector<int> d_int_output(n);
-
-    //mixed vals/output types
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin());
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (float) 3.5);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (float) 3.5);
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_float_output.begin(), (int) 3);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_float_output.begin(), (int) 3);
-    ASSERT_EQUAL(d_float_output, h_float_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (int) 3);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (int) 3);
-    ASSERT_EQUAL(d_int_output, h_int_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_int_output.begin(), (float) 3.5);
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_int_output.begin(), (float) 3.5);
-    ASSERT_EQUAL(d_int_output, h_int_output);
-}
-DECLARE_UNITTEST(TestScanByKeyMixedTypes);
-
-
-void TestScanByKeyLargeInput()
-{
-    typedef int T;
-    const unsigned int N = 1 << 20;
-
-    thrust::host_vector<unsigned int> vals_sizes = unittest::random_integers<unsigned int>(10);
-        
-    thrust::host_vector<unsigned int>   h_vals = unittest::random_integers<unsigned int>(N);
-    thrust::device_vector<unsigned int> d_vals = h_vals;
-
-    thrust::host_vector<unsigned int>   h_output(N, 0);
-    thrust::device_vector<unsigned int> d_output(N, 0);
-
-    for (unsigned int i = 0; i < vals_sizes.size(); i++)
-    {
-        const unsigned int n = vals_sizes[i] % N;
-
-        // define segments
-        thrust::host_vector<unsigned int> h_keys(n);
-        thrust::default_random_engine rng;
-        for(size_t i = 0, k = 0; i < n; i++){
-            h_keys[i] = k;
-            if (rng() % 100 == 0)
-                k++;
-        }
-        thrust::device_vector<unsigned int> d_keys = h_keys;
-    
-        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
-        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
-        ASSERT_EQUAL(d_output, h_output);
-
-        thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.begin() + n, h_vals.begin(), h_output.begin());
-        thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.begin() + n, d_vals.begin(), d_output.begin());
-        ASSERT_EQUAL(d_output, h_output);
-   }
-}
-DECLARE_UNITTEST(TestScanByKeyLargeInput);
-
-
-template <typename T, unsigned int N>
-void _TestScanByKeyWithLargeTypes(void)
-{
-    size_t n = (64 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector<   unsigned int   > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-    thrust::host_vector< FixedVector<T,N> > h_output(n);
-
-    thrust::default_random_engine rng;
-    for(size_t i = 0, k = 0; i < h_vals.size(); i++)
-    {
-        h_vals[i] = FixedVector<T,N>(i);
-        h_keys[i]  = k;
-        if (rng() % 5 == 0)
-            k++;
-    }
-
-    thrust::device_vector<   unsigned int   > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    thrust::device_vector< FixedVector<T,N> > d_output(n);
-    
-    thrust::inclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin());
-    thrust::inclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin());
-
-    ASSERT_EQUAL_QUIET(h_output, d_output);
-    
-    thrust::exclusive_scan_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), h_output.begin(), FixedVector<T,N>(0));
-    thrust::exclusive_scan_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), d_output.begin(), FixedVector<T,N>(0));
-    
-    ASSERT_EQUAL_QUIET(h_output, d_output);
-}
-
-void TestScanByKeyWithLargeTypes(void)
-{
-    _TestScanByKeyWithLargeTypes<int,    1>();
-    _TestScanByKeyWithLargeTypes<int,    2>();
-    _TestScanByKeyWithLargeTypes<int,    4>();
-    _TestScanByKeyWithLargeTypes<int,    8>();
-    //_TestScanByKeyWithLargeTypes<int,   16>();  // too many resources requested for launch
-    //_TestScanByKeyWithLargeTypes<int,   32>();  
-    //_TestScanByKeyWithLargeTypes<int,   64>();  // too large to pass as argument
-    //_TestScanByKeyWithLargeTypes<int,  128>();
-    //_TestScanByKeyWithLargeTypes<int,  256>();
-    //_TestScanByKeyWithLargeTypes<int,  512>();
-    //_TestScanByKeyWithLargeTypes<int, 1024>();
-}
-DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
-
diff --git a/testing/scan_by_key.exclusive.cu b/testing/scan_by_key.exclusive.cu
new file mode 100644
index 000000000..58354d848
--- /dev/null
+++ b/testing/scan_by_key.exclusive.cu
@@ -0,0 +1,576 @@
+#include <thrust/scan.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/random.h>
+
+#include <unittest/unittest.h>
+
+
+template <typename Vector>
+void TestExclusiveScanByKeySimple()
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  Iterator iter = thrust::exclusive_scan_by_key(keys.begin(),
+                                                keys.end(),
+                                                vals.begin(),
+                                                output.begin());
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0], 0);
+  ASSERT_EQUAL(output[1], 0);
+  ASSERT_EQUAL(output[2], 2);
+  ASSERT_EQUAL(output[3], 5);
+  ASSERT_EQUAL(output[4], 0);
+  ASSERT_EQUAL(output[5], 0);
+  ASSERT_EQUAL(output[6], 6);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10));
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                thrust::equal_to<T>(),
+                                thrust::multiplies<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 20);
+  ASSERT_EQUAL(output[3], 60);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 60);
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                thrust::equal_to<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestExclusiveScanByKeySimple);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_system& system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+
+void TestExclusiveScanByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::exclusive_scan_by_key(sys,
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchExplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator exclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+
+void TestExclusiveScanByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::exclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestExclusiveScanByKeyDispatchImplicit);
+
+
+struct head_flag_predicate
+{
+  template <typename T>
+  __host__ __device__ bool operator()(const T&, const T& b)
+  {
+    return b ? false : true;
+  }
+};
+
+
+template <typename Vector>
+void TestScanByKeyHeadFlags()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 0; vals[2] = 3;
+  keys[3] = 0; vals[3] = 4;
+  keys[4] = 1; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 0; vals[6] = 7;
+  // clang-format on
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                T(10),
+                                head_flag_predicate(),
+                                thrust::plus<T>());
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
+
+
+template <typename Vector>
+void TestScanByKeyReusedKeys()
+{
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 0; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 1; vals[6] = 7;
+  // clang-format on
+
+  thrust::exclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                typename Vector::value_type(10));
+
+  ASSERT_EQUAL(output[0], 10);
+  ASSERT_EQUAL(output[1], 10);
+  ASSERT_EQUAL(output[2], 12);
+  ASSERT_EQUAL(output[3], 15);
+  ASSERT_EQUAL(output[4], 10);
+  ASSERT_EQUAL(output[5], 10);
+  ASSERT_EQUAL(output[6], 16);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
+
+
+template <typename T>
+void TestExclusiveScanByKey(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  // without init
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+
+  // with init
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKey);
+
+
+template <typename T>
+void TestExclusiveScanByKeyInPlace(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  // in-place scans: in/out values aliasing
+  thrust::host_vector<T> h_output   = h_vals;
+  thrust::device_vector<T> d_output = d_vals;
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_output.begin(),
+                                h_output.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_output.begin(),
+                                d_output.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: in/out keys aliasing
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_keys.begin(),
+                                (T)11);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_keys.begin(),
+                                (T)11);
+  ASSERT_EQUAL(d_keys, h_keys);
+}
+DECLARE_VARIABLE_UNITTEST(TestExclusiveScanByKeyInPlace);
+
+
+void TestScanByKeyMixedTypes()
+{
+  const unsigned int n = 113;
+
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] %= 10;
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<float> h_float_output(n);
+  thrust::device_vector<float> d_float_output(n);
+  thrust::host_vector<int> h_int_output(n);
+  thrust::device_vector<int> d_int_output(n);
+
+  // mixed vals/output types
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin(),
+                                (float)3.5);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin(),
+                                (float)3.5);
+  ASSERT_EQUAL(d_float_output, h_float_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin(),
+                                (int)3);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin(),
+                                (int)3);
+  ASSERT_EQUAL(d_float_output, h_float_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_int_output.begin(),
+                                (int)3);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_int_output.begin(),
+                                (int)3);
+  ASSERT_EQUAL(d_int_output, h_int_output);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_int_output.begin(),
+                                (float)3.5);
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_int_output.begin(),
+                                (float)3.5);
+  ASSERT_EQUAL(d_int_output, h_int_output);
+}
+DECLARE_UNITTEST(TestScanByKeyMixedTypes);
+
+
+template <typename T>
+void TestScanByKeyDiscardOutput(std::size_t n)
+{
+  thrust::host_vector<T> h_keys(n);
+  thrust::default_random_engine rng;
+
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<T>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<T> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<T>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  auto out = thrust::make_discard_iterator();
+
+  // These are no-ops, but they should compile.
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out);
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{});
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{},
+                                thrust::equal_to<T>{});
+  thrust::exclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                T{},
+                                thrust::equal_to<T>{},
+                                thrust::multiplies<T>{});
+}
+DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput);
+
+
+void TestScanByKeyLargeInput()
+{
+  const unsigned int N = 1 << 20;
+
+  thrust::host_vector<unsigned int> vals_sizes =
+    unittest::random_integers<unsigned int>(10);
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(N);
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<unsigned int> h_output(N, 0);
+  thrust::device_vector<unsigned int> d_output(N, 0);
+
+  for (unsigned int i = 0; i < vals_sizes.size(); i++)
+  {
+    const unsigned int n = vals_sizes[i] % N;
+
+    // define segments
+    thrust::host_vector<unsigned int> h_keys(n);
+    thrust::default_random_engine rng;
+    for (size_t j = 0, k = 0; j < n; j++)
+    {
+      h_keys[j] = static_cast<unsigned int>(k);
+      if (rng() % 100 == 0)
+      {
+        k++;
+      }
+    }
+    thrust::device_vector<unsigned int> d_keys = h_keys;
+
+    thrust::exclusive_scan_by_key(h_keys.begin(),
+                                  h_keys.begin() + n,
+                                  h_vals.begin(),
+                                  h_output.begin());
+    thrust::exclusive_scan_by_key(d_keys.begin(),
+                                  d_keys.begin() + n,
+                                  d_vals.begin(),
+                                  d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+}
+DECLARE_UNITTEST(TestScanByKeyLargeInput);
+
+
+template <typename T, unsigned int N>
+void _TestScanByKeyWithLargeTypes()
+{
+  size_t n = (64 * 1024) / sizeof(FixedVector<T, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<T, N>> h_vals(n);
+  thrust::host_vector<FixedVector<T, N>> h_output(n);
+
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < h_vals.size(); i++)
+  {
+    h_keys[i] = static_cast<unsigned int>(k);
+    h_vals[i] = FixedVector<T, N>(static_cast<T>(i));
+    if (rng() % 5 == 0)
+    {
+      k++;
+    }
+  }
+
+  thrust::device_vector<unsigned int> d_keys      = h_keys;
+  thrust::device_vector<FixedVector<T, N>> d_vals = h_vals;
+  thrust::device_vector<FixedVector<T, N>> d_output(n);
+
+  thrust::exclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin(),
+                                FixedVector<T, N>(0));
+  thrust::exclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin(),
+                                FixedVector<T, N>(0));
+
+  ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+
+void TestScanByKeyWithLargeTypes()
+{
+  _TestScanByKeyWithLargeTypes<int, 1>();
+  _TestScanByKeyWithLargeTypes<int, 2>();
+  _TestScanByKeyWithLargeTypes<int, 4>();
+  _TestScanByKeyWithLargeTypes<int, 8>();
+
+  // too many resources requested for launch:
+  //_TestScanByKeyWithLargeTypes<int,   16>();
+  //_TestScanByKeyWithLargeTypes<int,   32>();
+
+  // too large to pass as argument:
+  //_TestScanByKeyWithLargeTypes<int,   64>();
+  //_TestScanByKeyWithLargeTypes<int,  128>();
+  //_TestScanByKeyWithLargeTypes<int,  256>();
+  //_TestScanByKeyWithLargeTypes<int,  512>();
+  //_TestScanByKeyWithLargeTypes<int, 1024>();
+}
+DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
diff --git a/testing/scan_by_key.inclusive.cu b/testing/scan_by_key.inclusive.cu
new file mode 100644
index 000000000..b2d2337e2
--- /dev/null
+++ b/testing/scan_by_key.inclusive.cu
@@ -0,0 +1,524 @@
+#include <thrust/scan.h>
+
+#include <thrust/functional.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/retag.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/random.h>
+
+#include <unittest/unittest.h>
+
+template <typename Vector>
+void TestInclusiveScanByKeySimple()
+{
+  typedef typename Vector::value_type T;
+  typedef typename Vector::iterator Iterator;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  Iterator iter = thrust::inclusive_scan_by_key(keys.begin(),
+                                                keys.end(),
+                                                vals.begin(),
+                                                output.begin());
+
+  ASSERT_EQUAL_QUIET(iter, output.end());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                thrust::equal_to<T>(),
+                                thrust::multiplies<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 6);
+  ASSERT_EQUAL(output[3], 24);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 42);
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                thrust::equal_to<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeySimple);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_system& system,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  system.validate_dispatch();
+  return result;
+}
+
+void TestInclusiveScanByKeyDispatchExplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  my_system sys(0);
+  thrust::inclusive_scan_by_key(sys,
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin(),
+                                vec.begin());
+
+  ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchExplicit);
+
+
+template <typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator>
+OutputIterator inclusive_scan_by_key(my_tag,
+                                     InputIterator1,
+                                     InputIterator1,
+                                     InputIterator2,
+                                     OutputIterator result)
+{
+  *result = 13;
+  return result;
+}
+
+void TestInclusiveScanByKeyDispatchImplicit()
+{
+  thrust::device_vector<int> vec(1);
+
+  thrust::inclusive_scan_by_key(thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()),
+                                thrust::retag<my_tag>(vec.begin()));
+
+  ASSERT_EQUAL(13, vec.front());
+}
+DECLARE_UNITTEST(TestInclusiveScanByKeyDispatchImplicit);
+
+struct head_flag_predicate
+{
+  template <typename T>
+  __host__ __device__ bool operator()(const T&, const T& b)
+  {
+    return b ? false : true;
+  }
+};
+
+template <typename Vector>
+void TestScanByKeyHeadFlags()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 0; vals[2] = 3;
+  keys[3] = 0; vals[3] = 4;
+  keys[4] = 1; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 0; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin(),
+                                head_flag_predicate(),
+                                thrust::plus<T>());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyHeadFlags);
+
+template <typename Vector>
+void TestInclusiveScanByKeyTransformIterator()
+{
+  typedef typename Vector::value_type T;
+
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 2; vals[4] = 5;
+  keys[5] = 3; vals[5] = 6;
+  keys[6] = 3; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(
+    keys.begin(),
+    keys.end(),
+    thrust::make_transform_iterator(vals.begin(), thrust::negate<T>()),
+    output.begin());
+
+  ASSERT_EQUAL(output[0], -1);
+  ASSERT_EQUAL(output[1], -2);
+  ASSERT_EQUAL(output[2], -5);
+  ASSERT_EQUAL(output[3], -9);
+  ASSERT_EQUAL(output[4], -5);
+  ASSERT_EQUAL(output[5], -6);
+  ASSERT_EQUAL(output[6], -13);
+}
+DECLARE_VECTOR_UNITTEST(TestInclusiveScanByKeyTransformIterator);
+
+
+template <typename Vector>
+void TestScanByKeyReusedKeys()
+{
+  Vector keys(7);
+  Vector vals(7);
+
+  Vector output(7, 0);
+
+  // clang-format off
+  keys[0] = 0; vals[0] = 1;
+  keys[1] = 1; vals[1] = 2;
+  keys[2] = 1; vals[2] = 3;
+  keys[3] = 1; vals[3] = 4;
+  keys[4] = 0; vals[4] = 5;
+  keys[5] = 1; vals[5] = 6;
+  keys[6] = 1; vals[6] = 7;
+  // clang-format on
+
+  thrust::inclusive_scan_by_key(keys.begin(),
+                                keys.end(),
+                                vals.begin(),
+                                output.begin());
+
+  ASSERT_EQUAL(output[0], 1);
+  ASSERT_EQUAL(output[1], 2);
+  ASSERT_EQUAL(output[2], 5);
+  ASSERT_EQUAL(output[3], 9);
+  ASSERT_EQUAL(output[4], 5);
+  ASSERT_EQUAL(output[5], 6);
+  ASSERT_EQUAL(output[6], 13);
+}
+DECLARE_VECTOR_UNITTEST(TestScanByKeyReusedKeys);
+
+
+template <typename T>
+void TestInclusiveScanByKey(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] = static_cast<int>(i % 10);
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKey);
+
+
+template <typename T>
+void TestInclusiveScanByKeyInPlace(const size_t n)
+{
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals = unittest::random_integers<int>(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<int>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  thrust::host_vector<T> h_output(n);
+  thrust::device_vector<T> d_output(n);
+
+  // in-place scans: in/out values aliasing
+  h_output = h_vals;
+  d_output = d_vals;
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_output.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_output.begin(),
+                                d_output.begin());
+  ASSERT_EQUAL(d_output, h_output);
+
+  // in-place scans: in/out keys aliasing
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_keys.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_keys.begin());
+  ASSERT_EQUAL(d_keys, h_keys);
+}
+DECLARE_VARIABLE_UNITTEST(TestInclusiveScanByKeyInPlace);
+
+
+void TestScanByKeyMixedTypes()
+{
+  const unsigned int n = 113;
+
+  thrust::host_vector<int> h_keys(n);
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<int>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<int> d_keys = h_keys;
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(n);
+  for (size_t i = 0; i < n; i++)
+    h_vals[i] %= 10;
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<float> h_float_output(n);
+  thrust::device_vector<float> d_float_output(n);
+  thrust::host_vector<int> h_int_output(n);
+  thrust::device_vector<int> d_int_output(n);
+
+  // mixed vals/output types
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_float_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_float_output.begin());
+  ASSERT_EQUAL(d_float_output, h_float_output);
+}
+DECLARE_UNITTEST(TestScanByKeyMixedTypes);
+
+
+template <typename T>
+void TestScanByKeyDiscardOutput(std::size_t n)
+{
+  thrust::host_vector<T> h_keys(n);
+  thrust::default_random_engine rng;
+
+  for (size_t i = 0, k = 0; i < n; i++)
+  {
+    h_keys[i] = static_cast<T>(k);
+    if (rng() % 10 == 0)
+    {
+      k++;
+    }
+  }
+  thrust::device_vector<T> d_keys = h_keys;
+
+  thrust::host_vector<T> h_vals(n);
+  for (size_t i = 0; i < n; i++)
+  {
+    h_vals[i] = static_cast<T>(i % 10);
+  }
+  thrust::device_vector<T> d_vals = h_vals;
+
+  auto out = thrust::make_discard_iterator();
+
+  // These are no-ops, but they should compile.
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out);
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                thrust::equal_to<T>{});
+  thrust::inclusive_scan_by_key(d_keys.cbegin(),
+                                d_keys.cend(),
+                                d_vals.cbegin(),
+                                out,
+                                thrust::equal_to<T>{},
+                                thrust::multiplies<T>{});
+}
+DECLARE_VARIABLE_UNITTEST(TestScanByKeyDiscardOutput);
+
+
+void TestScanByKeyLargeInput()
+{
+  const unsigned int N = 1 << 20;
+
+  thrust::host_vector<unsigned int> vals_sizes =
+    unittest::random_integers<unsigned int>(10);
+
+  thrust::host_vector<unsigned int> h_vals =
+    unittest::random_integers<unsigned int>(N);
+  thrust::device_vector<unsigned int> d_vals = h_vals;
+
+  thrust::host_vector<unsigned int> h_output(N, 0);
+  thrust::device_vector<unsigned int> d_output(N, 0);
+
+  for (unsigned int i = 0; i < vals_sizes.size(); i++)
+  {
+    const unsigned int n = vals_sizes[i] % N;
+
+    // define segments
+    thrust::host_vector<unsigned int> h_keys(n);
+    thrust::default_random_engine rng;
+    for (size_t j = 0, k = 0; j < n; j++)
+    {
+      h_keys[j] = static_cast<unsigned int>(k);
+      if (rng() % 100 == 0)
+      {
+        k++;
+      }
+    }
+    thrust::device_vector<unsigned int> d_keys = h_keys;
+
+    thrust::inclusive_scan_by_key(h_keys.begin(),
+                                  h_keys.begin() + n,
+                                  h_vals.begin(),
+                                  h_output.begin());
+    thrust::inclusive_scan_by_key(d_keys.begin(),
+                                  d_keys.begin() + n,
+                                  d_vals.begin(),
+                                  d_output.begin());
+    ASSERT_EQUAL(d_output, h_output);
+  }
+}
+DECLARE_UNITTEST(TestScanByKeyLargeInput);
+
+
+template <typename T, unsigned int N>
+void _TestScanByKeyWithLargeTypes()
+{
+  size_t n = (64 * 1024) / sizeof(FixedVector<T, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<T, N>> h_vals(n);
+  thrust::host_vector<FixedVector<T, N>> h_output(n);
+
+  thrust::default_random_engine rng;
+  for (size_t i = 0, k = 0; i < h_vals.size(); i++)
+  {
+    h_keys[i] = static_cast<unsigned int>(k);
+    h_vals[i] = FixedVector<T, N>(static_cast<T>(i));
+    if (rng() % 5 == 0)
+    {
+      k++;
+    }
+  }
+
+  thrust::device_vector<unsigned int> d_keys      = h_keys;
+  thrust::device_vector<FixedVector<T, N>> d_vals = h_vals;
+  thrust::device_vector<FixedVector<T, N>> d_output(n);
+
+  thrust::inclusive_scan_by_key(h_keys.begin(),
+                                h_keys.end(),
+                                h_vals.begin(),
+                                h_output.begin());
+  thrust::inclusive_scan_by_key(d_keys.begin(),
+                                d_keys.end(),
+                                d_vals.begin(),
+                                d_output.begin());
+
+  ASSERT_EQUAL_QUIET(h_output, d_output);
+}
+
+void TestScanByKeyWithLargeTypes()
+{
+  _TestScanByKeyWithLargeTypes<int, 1>();
+  _TestScanByKeyWithLargeTypes<int, 2>();
+  _TestScanByKeyWithLargeTypes<int, 4>();
+  _TestScanByKeyWithLargeTypes<int, 8>();
+
+  // too many resources requested for launch:
+  //_TestScanByKeyWithLargeTypes<int,   16>();
+  //_TestScanByKeyWithLargeTypes<int,   32>();
+
+  // too large to pass as argument
+  //_TestScanByKeyWithLargeTypes<int,   64>();
+  //_TestScanByKeyWithLargeTypes<int,  128>();
+  //_TestScanByKeyWithLargeTypes<int,  256>();
+  //_TestScanByKeyWithLargeTypes<int,  512>();
+  //_TestScanByKeyWithLargeTypes<int, 1024>();
+}
+DECLARE_UNITTEST(TestScanByKeyWithLargeTypes);
diff --git a/testing/scatter.cu b/testing/scatter.cu
index 2e918574e..ffd56f27c 100644
--- a/testing/scatter.cu
+++ b/testing/scatter.cu
@@ -10,8 +10,6 @@
 template <class Vector>
 void TestScatterSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector map(5);  // scatter indices
     Vector src(5);  // source vector
     Vector dst(8);  // destination vector
@@ -31,7 +29,7 @@ void TestScatterSimple(void)
     ASSERT_EQUAL(dst[6], 0);
     ASSERT_EQUAL(dst[7], 3);
 }
-DECLARE_VECTOR_UNITTEST(TestScatterSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterSimple);
 
 
 template<typename InputIterator1,
@@ -41,7 +39,7 @@ void scatter(my_system &system,
              InputIterator1,
              InputIterator1,
              InputIterator2,
-             RandomAccessIterator output)
+             RandomAccessIterator)
 {
     system.validate_dispatch();
 }
@@ -141,8 +139,6 @@ DECLARE_VARIABLE_UNITTEST(TestScatterToDiscardIterator);
 template <class Vector>
 void TestScatterIfSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector flg(5);  // predicate array
     Vector map(5);  // scatter indices
     Vector src(5);  // source vector
@@ -164,7 +160,7 @@ void TestScatterIfSimple(void)
     ASSERT_EQUAL(dst[6], 0);
     ASSERT_EQUAL(dst[7], 3);
 }
-DECLARE_VECTOR_UNITTEST(TestScatterIfSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterIfSimple);
 
 
 template<typename InputIterator1,
@@ -176,7 +172,7 @@ void scatter_if(my_system &system,
                 InputIterator1,
                 InputIterator2,
                 InputIterator3,
-                RandomAccessIterator output)
+                RandomAccessIterator)
 {
     system.validate_dispatch();
 }
@@ -284,8 +280,6 @@ DECLARE_VARIABLE_UNITTEST(TestScatterIfToDiscardIterator);
 template <typename Vector>
 void TestScatterCountingIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector source(10);
     thrust::sequence(source.begin(), source.end(), 0);
 
@@ -318,14 +312,12 @@ void TestScatterCountingIterator(void)
 
     ASSERT_EQUAL(output, map);
 }
-DECLARE_VECTOR_UNITTEST(TestScatterCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterCountingIterator);
 
 
 template <typename Vector>
 void TestScatterIfCountingIterator(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector source(10);
     thrust::sequence(source.begin(), source.end(), 0);
 
@@ -363,5 +355,5 @@ void TestScatterIfCountingIterator(void)
 
     ASSERT_EQUAL(output, map);
 }
-DECLARE_VECTOR_UNITTEST(TestScatterIfCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestScatterIfCountingIterator);
 
diff --git a/testing/sequence.cu b/testing/sequence.cu
index 48d9c19e7..6d29db4c3 100644
--- a/testing/sequence.cu
+++ b/testing/sequence.cu
@@ -5,7 +5,7 @@
 
 
 template<typename ForwardIterator>
-void sequence(my_system &system, ForwardIterator first, ForwardIterator)
+void sequence(my_system &system, ForwardIterator, ForwardIterator)
 {
     system.validate_dispatch();
 }
@@ -41,10 +41,9 @@ DECLARE_UNITTEST(TestSequenceDispatchImplicit);
 
 
 template <class Vector>
-void TestSequenceSimple(void)
+void TestSequenceSimple()
 {
-    typedef typename Vector::value_type T;
-    
+    using value_type = typename Vector::value_type;
     Vector v(5);
 
     thrust::sequence(v.begin(), v.end());
@@ -55,7 +54,7 @@ void TestSequenceSimple(void)
     ASSERT_EQUAL(v[3], 3);
     ASSERT_EQUAL(v[4], 4);
 
-    thrust::sequence(v.begin(), v.end(), 10);
+    thrust::sequence(v.begin(), v.end(), value_type{10});
 
     ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(v[1], 11);
@@ -63,7 +62,7 @@ void TestSequenceSimple(void)
     ASSERT_EQUAL(v[3], 13);
     ASSERT_EQUAL(v[4], 14);
     
-    thrust::sequence(v.begin(), v.end(), 10, 2);
+    thrust::sequence(v.begin(), v.end(), value_type{10}, value_type{2});
 
     ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(v[1], 12);
@@ -95,13 +94,14 @@ void TestSequence(size_t n)
 
     ASSERT_EQUAL(h_data, d_data);
     
-    thrust::sequence(h_data.begin(), h_data.end(), size_t(10), size_t(2));
-    thrust::sequence(d_data.begin(), d_data.end(), size_t(10), size_t(2));
+    thrust::sequence(h_data.begin(), h_data.end(), T(10), T(2));
+    thrust::sequence(d_data.begin(), d_data.end(), T(10), T(2));
 
     ASSERT_EQUAL(h_data, d_data);
 }
 DECLARE_VARIABLE_UNITTEST(TestSequence);
 
+
 template <typename T>
 void TestSequenceToDiscardIterator(size_t n)
 {
@@ -117,3 +117,54 @@ void TestSequenceToDiscardIterator(size_t n)
 }
 DECLARE_VARIABLE_UNITTEST(TestSequenceToDiscardIterator);
 
+
+void TestSequenceComplex()
+{
+  thrust::device_vector<thrust::complex<double> > m(64);
+  thrust::sequence(m.begin(), m.end());
+}
+DECLARE_UNITTEST(TestSequenceComplex);
+
+// A class that doesnt accept conversion from size_t but can be multiplied by a scalar
+struct Vector
+{
+    Vector() = default;
+    // Explicitly disable construction from size_t
+    Vector(std::size_t) = delete;
+    __host__ __device__ Vector(int x_, int y_) : x{x_}, y{y_} {}
+    Vector(const Vector&) = default;
+    Vector &operator=(const Vector&) = default;
+
+    int x, y;
+};
+
+// Vector-Vector addition
+__host__ __device__ Vector operator+(const Vector a, const Vector b)
+{
+  return Vector{a.x + b.x, a.y + b.y};
+}
+
+// Vector-Scalar Multiplication
+// Multiplication by std::size_t is required by thrust::sequence.
+__host__ __device__ Vector operator*(const std::size_t a, const Vector b)
+{
+  return Vector{static_cast<int>(a) * b.x, static_cast<int>(a) * b.y};
+}
+__host__ __device__ Vector operator*(const Vector b, const std::size_t a)
+{
+  return Vector{static_cast<int>(a) * b.x, static_cast<int>(a) * b.y};
+}
+
+void TestSequenceNoSizeTConversion()
+{
+    thrust::device_vector<Vector> m(64);
+    thrust::sequence(m.begin(), m.end(), ::Vector{0, 0}, ::Vector{1, 2});
+
+    for (std::size_t i = 0; i < m.size(); ++i)
+    {
+        const ::Vector v = m[i];
+        ASSERT_EQUAL(static_cast<std::size_t>(v.x), i);
+        ASSERT_EQUAL(static_cast<std::size_t>(v.y), 2 * i);
+    }
+}
+DECLARE_UNITTEST(TestSequenceNoSizeTConversion);
diff --git a/testing/set_difference.cu b/testing/set_difference.cu
index b107bda36..5abc5f1fb 100644
--- a/testing/set_difference.cu
+++ b/testing/set_difference.cu
@@ -169,11 +169,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceEquivalentRanges);
 template<typename T>
 void TestSetDifferenceMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -181,8 +181,8 @@ void TestSetDifferenceMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
@@ -211,3 +211,32 @@ void TestSetDifferenceMultiset(const size_t n)
 }
 DECLARE_VARIABLE_UNITTEST(TestSetDifferenceMultiset);
 
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin(0);
+    thrust::counting_iterator<long long> end = begin + (1ll << magnitude);
+    thrust::counting_iterator<long long> end_longer = end + 1;
+    ASSERT_EQUAL(thrust::distance(begin, end), 1ll << magnitude);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_difference(thrust::device, begin, end_longer, begin, end, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*end);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/testing/set_difference_by_key.cu b/testing/set_difference_by_key.cu
index be68685fc..29dbb68fc 100644
--- a/testing/set_difference_by_key.cu
+++ b/testing/set_difference_by_key.cu
@@ -250,11 +250,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetDifferenceByKeyEquivalentRanges);
 template<typename T>
 void TestSetDifferenceByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -262,8 +262,8 @@ void TestSetDifferenceByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_intersection.cu b/testing/set_intersection.cu
index 3cae00f30..93ef05d74 100644
--- a/testing/set_intersection.cu
+++ b/testing/set_intersection.cu
@@ -209,20 +209,20 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionEquivalentRanges);
 template<typename T>
 void TestSetIntersectionMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
-    int temp = static_cast<int>(*i);
-    temp %= 13;
-    *i = temp;
+    int tmp = static_cast<int>(*i);
+    tmp %= 13;
+    *i = static_cast<T>(tmp);
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
@@ -251,3 +251,33 @@ void TestSetIntersectionMultiset(const size_t n)
 }
 DECLARE_VARIABLE_UNITTEST(TestSetIntersectionMultiset);
 
+// FIXME: disabled on Windows, because it causes a failure on the internal CI system in one specific configuration.
+// That failure will be tracked in a new NVBug, this is disabled to unblock submitting all the other changes.
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
+void TestSetDifferenceWithBigIndexesHelper(int magnitude)
+{
+    thrust::counting_iterator<long long> begin1(0);
+    thrust::counting_iterator<long long> begin2 = begin1 + (1ll << magnitude);
+    thrust::counting_iterator<long long> end1 = begin2 + 1;
+    thrust::counting_iterator<long long> end2 = begin2 + (1ll << magnitude);
+    ASSERT_EQUAL(thrust::distance(begin2, end1), 1);
+
+    thrust::device_vector<long long> result;
+    result.resize(1);
+    thrust::set_intersection(thrust::device, begin1, end1, begin2, end2, result.begin());
+
+    thrust::host_vector<long long> expected;
+    expected.push_back(*begin2);
+
+    ASSERT_EQUAL(result, expected);
+}
+
+void TestSetDifferenceWithBigIndexes()
+{
+    TestSetDifferenceWithBigIndexesHelper(30);
+    TestSetDifferenceWithBigIndexesHelper(31);
+    TestSetDifferenceWithBigIndexesHelper(32);
+    TestSetDifferenceWithBigIndexesHelper(33);
+}
+DECLARE_UNITTEST(TestSetDifferenceWithBigIndexes);
+#endif
diff --git a/testing/set_intersection_by_key.cu b/testing/set_intersection_by_key.cu
index 6b7d51fc8..d82ee04ad 100644
--- a/testing/set_intersection_by_key.cu
+++ b/testing/set_intersection_by_key.cu
@@ -234,11 +234,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetIntersectionByKeyEquivalentRanges);
 template<typename T>
 void TestSetIntersectionByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -246,8 +246,8 @@ void TestSetIntersectionByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_symmetric_difference.cu b/testing/set_symmetric_difference.cu
index b3e3c1493..dde145fec 100644
--- a/testing/set_symmetric_difference.cu
+++ b/testing/set_symmetric_difference.cu
@@ -168,11 +168,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceEquivalentRanges);
 template<typename T>
 void TestSetSymmetricDifferenceMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -180,8 +180,8 @@ void TestSetSymmetricDifferenceMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b(vec.begin() + n, vec.end());
 
   thrust::sort(h_a.begin(), h_a.end());
   thrust::sort(h_b.begin(), h_b.end());
diff --git a/testing/set_symmetric_difference_by_key.cu b/testing/set_symmetric_difference_by_key.cu
index c2688fdb8..98e416af8 100644
--- a/testing/set_symmetric_difference_by_key.cu
+++ b/testing/set_symmetric_difference_by_key.cu
@@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetSymmetricDifferenceByKeyEquivalentRanges);
 template<typename T>
 void TestSetSymmetricDifferenceByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -266,8 +266,8 @@ void TestSetSymmetricDifferenceByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/set_union_by_key.cu b/testing/set_union_by_key.cu
index ec8864941..7d58ebf4f 100644
--- a/testing/set_union_by_key.cu
+++ b/testing/set_union_by_key.cu
@@ -254,11 +254,11 @@ DECLARE_VARIABLE_UNITTEST(TestSetUnionByKeyEquivalentRanges);
 template<typename T>
 void TestSetUnionByKeyMultiset(const size_t n)
 {
-  thrust::host_vector<T> temp = unittest::random_integers<T>(2 * n);
+  thrust::host_vector<T> vec = unittest::random_integers<T>(2 * n);
 
   // restrict elements to [min,13)
-  for(typename thrust::host_vector<T>::iterator i = temp.begin();
-      i != temp.end();
+  for(typename thrust::host_vector<T>::iterator i = vec.begin();
+      i != vec.end();
       ++i)
   {
     int temp = static_cast<int>(*i);
@@ -266,8 +266,8 @@ void TestSetUnionByKeyMultiset(const size_t n)
     *i = temp;
   }
 
-  thrust::host_vector<T> h_a_key(temp.begin(), temp.begin() + n);
-  thrust::host_vector<T> h_b_key(temp.begin() + n, temp.end());
+  thrust::host_vector<T> h_a_key(vec.begin(), vec.begin() + n);
+  thrust::host_vector<T> h_b_key(vec.begin() + n, vec.end());
 
   thrust::sort(h_a_key.begin(), h_a_key.end());
   thrust::sort(h_b_key.begin(), h_b_key.end());
diff --git a/testing/shuffle.cu b/testing/shuffle.cu
new file mode 100644
index 000000000..77e660c00
--- /dev/null
+++ b/testing/shuffle.cu
@@ -0,0 +1,602 @@
+#include <thrust/detail/config.h>
+
+#include <map>
+#include <limits>
+#include <thrust/random.h>
+#include <thrust/sequence.h>
+#include <thrust/shuffle.h>
+#include <thrust/sort.h>
+#include <unittest/unittest.h>
+
+// Functions for performing statistical tests of randomness
+// From NIST-Statistical-Test-Suite
+// Licence:
+//  "This software was developed at the National Institute of Standards and
+//  Technology by employees of the Federal Government in the course of their
+//  official duties. Pursuant to title 17 Section 105 of the United States Code
+//  this software is not subject to copyright protection and is in the public
+//  domain. The NIST Statistical Test Suite is an experimental system. NIST
+//  assumes no responsibility whatsoever for its use by other parties, and makes
+//  no guarantees, expressed or implied, about its quality, reliability, or any
+//  other characteristic. We would appreciate acknowledgment if the software is
+//  used."
+class CephesFunctions {
+public:
+  static double cephes_igamc(double a, double x) {
+    double ans, ax, c, yc, r, t, y, z;
+    double pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+    if ((x <= 0) || (a <= 0))
+      return (1.0);
+
+    if ((x < 1.0) || (x < a))
+      return (1.e0 - cephes_igam(a, x));
+
+    ax = a * log(x) - x - cephes_lgam(a);
+
+    if (ax < -MAXLOG) {
+      printf("igamc: UNDERFLOW\n");
+      return 0.0;
+    }
+    ax = exp(ax);
+
+    /* continued fraction */
+    y = 1.0 - a;
+    z = x + y + 1.0;
+    c = 0.0;
+    pkm2 = 1.0;
+    qkm2 = x;
+    pkm1 = x + 1.0;
+    qkm1 = z * x;
+    ans = pkm1 / qkm1;
+
+    do {
+      c += 1.0;
+      y += 1.0;
+      z += 2.0;
+      yc = y * c;
+      pk = pkm1 * z - pkm2 * yc;
+      qk = qkm1 * z - qkm2 * yc;
+      if (qk != 0) {
+        r = pk / qk;
+        t = fabs((ans - r) / r);
+        ans = r;
+      } else
+        t = 1.0;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+      if (fabs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+    } while (t > MACHEP);
+
+    return ans * ax;
+  }
+
+private:
+  static constexpr double rel_error = 1E-12;
+
+  static constexpr double MACHEP = 1.11022302462515654042E-16;  // 2**-53
+  static constexpr double MAXLOG = 7.09782712893383996732224E2; // log(MAXNUM)
+  static constexpr double MAXNUM = 1.7976931348623158E308; // 2**1024*(1-MACHEP)
+  static constexpr double PI = 3.14159265358979323846;
+
+  static constexpr double big = 4.503599627370496e15;
+  static constexpr double biginv = 2.22044604925031308085e-16;
+
+  static int sgngam;
+
+  static double cephes_igam(double a, double x) {
+    double ans, ax, c, r;
+
+    if ((x <= 0) || (a <= 0))
+      return 0.0;
+
+    if ((x > 1.0) && (x > a))
+      return 1.e0 - cephes_igamc(a, x);
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * log(x) - x - cephes_lgam(a);
+    if (ax < -MAXLOG) {
+      printf("igam: UNDERFLOW\n");
+      return 0.0;
+    }
+    ax = exp(ax);
+
+    /* power series */
+    r = a;
+    c = 1.0;
+    ans = 1.0;
+
+    do {
+      r += 1.0;
+      c *= x / r;
+      ans += c;
+    } while (c / ans > MACHEP);
+
+    return ans * ax / a;
+  }
+
+  /* A[]: Stirling's formula expansion of log gamma
+   * B[], C[]: log gamma function between 2 and 3
+   */
+  static constexpr double A[] = {
+      0.000811614167470508488140545910738410384510643780,
+      -0.000595061904284301438315674115386855191900394857,
+      0.000793650340457716942620114419781884862459264696,
+      -0.002777777777300996942672073330982129846233874559,
+      0.083333333333333189929525985917280195280909538269};
+  static constexpr double B[] = {
+      -1378.251525691208598800585605204105377197265625,
+      -38801.631513463784358464181423187255859375,
+      -331612.9927388711948879063129425048828125,
+      -1162370.97492762305773794651031494140625,
+      -1721737.00820839661173522472381591796875,
+      -853555.66424576542340219020843505859375};
+  static constexpr double C[] = {
+      -351.8157014365234545039129443466663360595703125,
+      -17064.21066518811494461260735988616943359375,
+      -220528.59055385444662533700466156005859375,
+      -1139334.44367982516996562480926513671875,
+      -2532523.07177582941949367523193359375,
+      -2018891.4143353276886045932769775390625};
+
+  static constexpr double MAXLGM = 2.556348e305;
+
+  /* Logarithm of gamma function */
+  static double cephes_lgam(double x) {
+    double p, q, u, w, z;
+    int i;
+
+    sgngam = 1;
+
+    if (x < -34.0) {
+      q = -x;
+      w = cephes_lgam(q); /* note this modifies sgngam! */
+      p = floor(q);
+      if (p == q) {
+      lgsing:
+        goto loverf;
+      }
+      i = (int)p;
+      if ((i & 1) == 0)
+        sgngam = -1;
+      else
+        sgngam = 1;
+      z = q - p;
+      if (z > 0.5) {
+        p += 1.0;
+        z = p - q;
+      }
+      z = q * sin(PI * z);
+      if (z == 0.0)
+        goto lgsing;
+      /*      z = log(PI) - log( z ) - w;*/
+      z = log(PI) - log(z) - w;
+      return z;
+    }
+
+    if (x < 13.0) {
+      z = 1.0;
+      p = 0.0;
+      u = x;
+      while (u >= 3.0) {
+        p -= 1.0;
+        u = x + p;
+        z *= u;
+      }
+      while (u < 2.0) {
+        if (u == 0.0)
+          goto lgsing;
+        z /= u;
+        p += 1.0;
+        u = x + p;
+      }
+      if (z < 0.0) {
+        sgngam = -1;
+        z = -z;
+      } else
+        sgngam = 1;
+      if (u == 2.0)
+        return (log(z));
+      p -= 2.0;
+      x = x + p;
+      p = x * cephes_polevl(x, B, 5) /
+          cephes_p1evl(x, C, 6);
+
+      return log(z) + p;
+    }
+
+    if (x > MAXLGM) {
+    loverf:
+      printf("lgam: OVERFLOW\n");
+
+      return sgngam * MAXNUM;
+    }
+
+    q = (x - 0.5) * log(x) - x + log(sqrt(2 * PI));
+    if (x > 1.0e8)
+      return q;
+
+    p = 1.0 / (x * x);
+    if (x >= 1000.0)
+      q +=
+          ((7.9365079365079365079365e-4 * p - 2.7777777777777777777778e-3) * p +
+           0.0833333333333333333333) /
+          x;
+    else
+      q += cephes_polevl(p, A, 4) / x;
+
+    return q;
+  }
+
+  static double cephes_polevl(double x, const double *coef, int N) {
+    const double *p = coef;
+    double ans = *p++;
+    int i = N;
+    do
+      ans = ans * x + *p++;
+    while (--i);
+
+    return ans;
+  }
+
+  static double cephes_p1evl(double x, const double *coef, int N) {
+    const double *p = coef;
+    double ans = x + *p++;
+    int i = N - 1;
+
+    do
+      ans = ans * x + *p++;
+    while (--i);
+
+    return ans;
+  }
+
+  static double cephes_erf(double x) {
+    static const double two_sqrtpi = 1.128379167095512574;
+    double sum = x, term = x, xsqr = x * x;
+    int j = 1;
+
+    if (fabs(x) > 2.2)
+      return 1.0 - cephes_erfc(x);
+
+    do {
+      term *= xsqr / j;
+      sum -= term / (2 * j + 1);
+      j++;
+      term *= xsqr / j;
+      sum += term / (2 * j + 1);
+      j++;
+    } while (fabs(term) / sum > rel_error);
+
+    return two_sqrtpi * sum;
+  }
+
+  static double cephes_erfc(double x) {
+    static const double one_sqrtpi = 0.564189583547756287;
+    double a = 1, b = x, c = x, d = x * x + 0.5;
+    double q1, q2 = b / d, n = 1.0, t;
+
+    if (fabs(x) < 2.2)
+      return 1.0 - cephes_erf(x);
+    if (x < 0)
+      return 2.0 - cephes_erfc(-x);
+
+    do {
+      t = a * n + b * x;
+      a = b;
+      b = t;
+      t = c * n + d * x;
+      c = d;
+      d = t;
+      n += 0.5;
+      q1 = q2;
+      q2 = b / d;
+    } while (fabs(q1 - q2) / q2 > rel_error);
+
+    return one_sqrtpi * exp(-x * x) * q2;
+  }
+
+  static double cephes_normal(double x) {
+    double arg, result, sqrt2 = 1.414213562373095048801688724209698078569672;
+
+    if (x > 0) {
+      arg = x / sqrt2;
+      result = 0.5 * (1 + erf(arg));
+    } else {
+      arg = -x / sqrt2;
+      result = 0.5 * (1 - erf(arg));
+    }
+
+    return (result);
+  }
+};
+int CephesFunctions::sgngam = 0;
+constexpr double CephesFunctions::A[];
+constexpr double CephesFunctions::B[];
+constexpr double CephesFunctions::C[];
+
+template <typename Vector>
+void TestShuffleSimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(data.begin(), data.end());
+  thrust::default_random_engine g(2);
+  thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+  thrust::sort(shuffled.begin(), shuffled.end());
+  // Check all of our data is present
+  // This only tests for strange conditions like duplicated elements
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleSimple);
+
+template <typename Vector>
+void TestShuffleCopySimple() {
+  Vector data(5);
+  data[0] = 0;
+  data[1] = 1;
+  data[2] = 2;
+  data[3] = 3;
+  data[4] = 4;
+  Vector shuffled(5);
+  thrust::default_random_engine g(2);
+  thrust::shuffle_copy(data.begin(), data.end(), shuffled.begin(), g);
+  g.seed(2);
+  thrust::shuffle(data.begin(), data.end(), g);
+  ASSERT_EQUAL(shuffled, data);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleCopySimple);
+
+template <typename T>
+void TestHostDeviceIdentical(size_t m) {
+  thrust::host_vector<T> host_result(m);
+  thrust::host_vector<T> device_result(m);
+  thrust::sequence(host_result.begin(), host_result.end(), T{});
+  thrust::sequence(device_result.begin(), device_result.end(), T{});
+
+  thrust::default_random_engine host_g(183);
+  thrust::default_random_engine device_g(183);
+
+  thrust::shuffle(host_result.begin(), host_result.end(), host_g);
+  thrust::shuffle(device_result.begin(), device_result.end(), device_g);
+
+  ASSERT_EQUAL(device_result, host_result);
+}
+DECLARE_VARIABLE_UNITTEST(TestHostDeviceIdentical);
+
+template <typename T>
+void TestFunctionIsBijection(size_t m) {
+  thrust::default_random_engine host_g(0xD5);
+  thrust::default_random_engine device_g(0xD5);
+
+  thrust::system::detail::generic::feistel_bijection host_f(m, host_g);
+  thrust::system::detail::generic::feistel_bijection device_f(m, device_g);
+
+  if (static_cast<double>(host_f.nearest_power_of_two()) >= static_cast<double>(std::numeric_limits<T>::max()) || m == 0) {
+    return;
+  }
+
+  thrust::host_vector<T> host_result(host_f.nearest_power_of_two());
+  thrust::host_vector<T> device_result(device_f.nearest_power_of_two());
+  thrust::sequence(host_result.begin(), host_result.end(), T{});
+  thrust::sequence(device_result.begin(), device_result.end(), T{});
+
+  thrust::transform(host_result.begin(), host_result.end(), host_result.begin(),
+                    host_f);
+  thrust::transform(device_result.begin(), device_result.end(),
+                    device_result.begin(), device_f);
+
+  ASSERT_EQUAL(host_result, device_result);
+
+  thrust::sort(host_result.begin(), host_result.end());
+  // Assert all values were generated exactly once
+  for (uint64_t i = 0; i < m; i++) {
+    ASSERT_EQUAL((uint64_t)host_result[i], i);
+  }
+}
+DECLARE_VARIABLE_UNITTEST(TestFunctionIsBijection);
+
+void TestBijectionLength() {
+  thrust::default_random_engine g(0xD5);
+
+  uint64_t m = 31;
+  thrust::system::detail::generic::feistel_bijection f(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32));
+
+  m = 32;
+  f = thrust::system::detail::generic::feistel_bijection(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(32));
+
+  m = 1;
+  f = thrust::system::detail::generic::feistel_bijection(m, g);
+  ASSERT_EQUAL(f.nearest_power_of_two(), uint64_t(16));
+}
+DECLARE_UNITTEST(TestBijectionLength);
+
+// Individual input keys should be permuted to output locations with uniform
+// probability. Perform chi-squared test with confidence 99.9%.
+template <typename Vector>
+void TestShuffleKeyPosition() {
+  typedef typename Vector::value_type T;
+  size_t m = 20;
+  size_t num_samples = 100;
+  thrust::host_vector<size_t> index_sum(m, 0);
+  thrust::host_vector<T> sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+
+  thrust::default_random_engine g(0xD5);
+  for (size_t i = 0; i < num_samples; i++) {
+    Vector shuffled(sequence.begin(), sequence.end());
+    thrust::shuffle(shuffled.begin(), shuffled.end(), g);
+    thrust::host_vector<T> tmp(shuffled.begin(), shuffled.end());
+
+    for (auto j = 0ull; j < m; j++) {
+      index_sum[tmp[j]] += j;
+    }
+  }
+
+  double expected_average_position = static_cast<double>(m - 1) / 2;
+  double chi_squared = 0.0;
+  for (auto j = 0ull; j < m; j++) {
+    double average_position = static_cast<double>(index_sum[j]) / num_samples;
+    chi_squared += std::pow(expected_average_position - average_position, 2) /
+                   expected_average_position;
+  }
+  // Tabulated chi-squared critical value for m-1=19 degrees of freedom
+  // and 99.9% confidence
+  double confidence_threshold = 43.82;
+  ASSERT_LESS(chi_squared, confidence_threshold);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleKeyPosition);
+
+struct vector_compare {
+  template <typename VectorT>
+  bool operator()(const VectorT &a, const VectorT &b) const {
+    for (auto i = 0ull; i < a.size(); i++) {
+      if (a[i] < b[i])
+        return true;
+      if (a[i] > b[i])
+        return false;
+    }
+    return false;
+  }
+};
+
+// Brute force check permutations are uniformly distributed on small input
+// Uses a chi-squared test indicating 99% confidence the output is uniformly
+// random
+template <typename Vector>
+void TestShuffleUniformPermutation() {
+  typedef typename Vector::value_type T;
+
+  size_t m = 5;
+  size_t num_samples = 1000;
+  size_t total_permutations = 1 * 2 * 3 * 4 * 5;
+  std::map<thrust::host_vector<T>, size_t, vector_compare> permutation_counts;
+  Vector sequence(m);
+  thrust::sequence(sequence.begin(), sequence.end(), T(0));
+  thrust::default_random_engine g(0xD5);
+  for (auto i = 0ull; i < num_samples; i++) {
+    thrust::shuffle(sequence.begin(), sequence.end(), g);
+    thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+    permutation_counts[tmp]++;
+  }
+
+  ASSERT_EQUAL(permutation_counts.size(), total_permutations);
+
+  double chi_squared = 0.0;
+  double expected_count = static_cast<double>(num_samples) / total_permutations;
+  for (auto kv : permutation_counts) {
+    chi_squared += std::pow(expected_count - kv.second, 2) / expected_count;
+  }
+  double p_score = CephesFunctions::cephes_igamc(
+      (double)(total_permutations - 1) / 2.0, chi_squared / 2.0);
+  ASSERT_GREATER(p_score, 0.01);
+}
+DECLARE_VECTOR_UNITTEST(TestShuffleUniformPermutation);
+
+template <typename Vector>
+void TestShuffleEvenSpacingBetweenOccurances() {
+  typedef typename Vector::value_type T;
+  const uint64_t shuffle_size = 10;
+  const uint64_t num_samples = 1000;
+
+  thrust::host_vector<T> h_results;
+  Vector sequence(shuffle_size);
+  thrust::sequence(sequence.begin(), sequence.end(), 0);
+  thrust::default_random_engine g(0xD6);
+  for (auto i = 0ull; i < num_samples; i++) {
+    thrust::shuffle(sequence.begin(), sequence.end(), g);
+    thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+    h_results.insert(h_results.end(), sequence.begin(), sequence.end());
+  }
+
+  std::vector<std::vector<std::vector<uint64_t>>> distance_between(
+      num_samples, std::vector<std::vector<uint64_t>>(
+                       num_samples, std::vector<uint64_t>(shuffle_size, 0)));
+
+  for (uint64_t sample = 0; sample < num_samples; sample++) {
+    for (uint64_t i = 0; i < shuffle_size - 1; i++) {
+      for (uint64_t j = 1; j < shuffle_size - i; j++) {
+        T val_1 = h_results[sample * shuffle_size + i];
+        T val_2 = h_results[sample * shuffle_size + i + j];
+        distance_between[val_1][val_2][j]++;
+        distance_between[val_2][val_1][shuffle_size - j]++;
+      }
+    }
+  }
+
+  const double expected_occurances = (double)num_samples / (shuffle_size - 1);
+  for (uint64_t val_1 = 0; val_1 < shuffle_size; val_1++) {
+    for (uint64_t val_2 = val_1 + 1; val_2 < shuffle_size; val_2++) {
+      double chi_squared = 0.0;
+      auto &distances = distance_between[val_1][val_2];
+      for (uint64_t i = 1; i < shuffle_size; i++) {
+        chi_squared += std::pow((double)distances[i] - expected_occurances, 2) /
+                       expected_occurances;
+      }
+
+      double p_score = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 2) / 2.0, chi_squared / 2.0);
+      ASSERT_GREATER(p_score, 0.01);
+    }
+  }
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenSpacingBetweenOccurances);
+
+template <typename Vector>
+void TestShuffleEvenDistribution() {
+  typedef typename Vector::value_type T;
+  const uint64_t shuffle_sizes[] = {10, 100, 500};
+  thrust::default_random_engine g(0xD5);
+  for (auto shuffle_size : shuffle_sizes) {
+    if(shuffle_size > (uint64_t)std::numeric_limits<T>::max())
+      continue;
+    const uint64_t num_samples = shuffle_size == 500 ? 1000 : 200;
+
+    std::vector<uint64_t> counts(shuffle_size * shuffle_size, 0);
+    Vector sequence(shuffle_size);
+    for (auto i = 0ull; i < num_samples; i++) {
+      thrust::sequence(sequence.begin(), sequence.end(), 0);
+      thrust::shuffle(sequence.begin(), sequence.end(), g);
+      thrust::host_vector<T> tmp(sequence.begin(), sequence.end());
+      for (uint64_t j = 0; j < shuffle_size; j++) {
+        assert(j < tmp.size());
+        counts.at(j * shuffle_size + tmp[j])++;
+      }
+    }
+
+    const double expected_occurances = (double)num_samples / shuffle_size;
+    for (uint64_t i = 0; i < shuffle_size; i++) {
+      double chi_squared_pos = 0.0;
+      double chi_squared_num = 0.0;
+      for (uint64_t j = 0; j < shuffle_size; j++) {
+        auto count_pos = counts.at(i * shuffle_size + j);
+        auto count_num = counts.at(j * shuffle_size + i);
+        chi_squared_pos +=
+            pow((double)count_pos - expected_occurances, 2) / expected_occurances;
+        chi_squared_num +=
+            pow((double)count_num - expected_occurances, 2) / expected_occurances;
+      }
+
+      double p_score_pos = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 1) / 2.0, chi_squared_pos / 2.0);
+      ASSERT_GREATER(p_score_pos, 0.001 / (double)shuffle_size);
+
+      double p_score_num = CephesFunctions::cephes_igamc(
+          (double)(shuffle_size - 1) / 2.0, chi_squared_num / 2.0);
+      ASSERT_GREATER(p_score_num, 0.001 / (double)shuffle_size);
+    }
+  }
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestShuffleEvenDistribution);
diff --git a/testing/sort.cu b/testing/sort.cu
index c620e8239..e460655c4 100644
--- a/testing/sort.cu
+++ b/testing/sort.cu
@@ -64,8 +64,6 @@ void InitializeSimpleKeySortTest(Vector& unsorted_keys, Vector& sorted_keys)
 template <class Vector>
 void TestSortSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector unsorted_keys;
     Vector   sorted_keys;
 
diff --git a/testing/stable_sort.cu b/testing/stable_sort.cu
index b51240171..c7cdb3e52 100644
--- a/testing/stable_sort.cu
+++ b/testing/stable_sort.cu
@@ -87,7 +87,7 @@ void TestStableSortSimple(void)
 
     ASSERT_EQUAL(unsorted_keys,   sorted_keys);
 }
-DECLARE_VECTOR_UNITTEST(TestStableSortSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortSimple);
 
 
 template <typename T>
@@ -171,5 +171,5 @@ void TestStableSortWithIndirection(void)
     ASSERT_EQUAL(data[5], T(5));
     ASSERT_EQUAL(data[6], T(2));
 }
-DECLARE_VECTOR_UNITTEST(TestStableSortWithIndirection);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortWithIndirection);
 
diff --git a/testing/stable_sort_by_key.cu b/testing/stable_sort_by_key.cu
index c43c40b6f..e3736542d 100644
--- a/testing/stable_sort_by_key.cu
+++ b/testing/stable_sort_by_key.cu
@@ -92,7 +92,7 @@ void TestStableSortByKeySimple(void)
     ASSERT_EQUAL(unsorted_keys,   sorted_keys);
     ASSERT_EQUAL(unsorted_values, sorted_values);
 }
-DECLARE_VECTOR_UNITTEST(TestStableSortByKeySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestStableSortByKeySimple);
 
 
 template <typename T>
diff --git a/testing/stable_sort_by_key_large.cu b/testing/stable_sort_by_key_large.cu
deleted file mode 100644
index fc69de64c..000000000
--- a/testing/stable_sort_by_key_large.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <unittest/unittest.h>
-#include <thrust/sort.h>
-#include <thrust/functional.h>
-
-template <typename T>
-struct less_div_10
-{
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 < ((int) rhs) / 10;}
-};
-
-template <typename T>
-struct greater_div_10
-{
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return ((int) lhs) / 10 > ((int) rhs) / 10;}
-};
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeKeys(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector< FixedVector<T,N> > h_keys(n);
-    thrust::host_vector<   unsigned int   > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = FixedVector<T,N>(rand());
-        h_vals[i] = i;
-    }
-
-    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
-    thrust::device_vector<   unsigned int   > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeKeys(void)
-{
-    _TestStableSortByKeyWithLargeKeys<int,    4>();
-    _TestStableSortByKeyWithLargeKeys<int,    8>();
-    _TestStableSortByKeyWithLargeKeys<int,   16>();
-
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeKeys<int,   32>();
-//    _TestStableSortByKeyWithLargeKeys<int,   64>();
-//    _TestStableSortByKeyWithLargeKeys<int,  128>();
-//    _TestStableSortByKeyWithLargeKeys<int,  256>();
-//    _TestStableSortByKeyWithLargeKeys<int,  512>();
-//    _TestStableSortByKeyWithLargeKeys<int, 1024>();
-//    _TestStableSortByKeyWithLargeKeys<int, 2048>();
-//    _TestStableSortByKeyWithLargeKeys<int, 4096>();
-//    _TestStableSortByKeyWithLargeKeys<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys);
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeValues(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector<   unsigned int   > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = rand();
-        h_vals[i] = FixedVector<T,N>(i);
-    }
-
-    thrust::device_vector<   unsigned int   > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-
-    // so cuda::stable_merge_sort_by_key() is called
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin(), greater_div_10<unsigned int>());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin(), greater_div_10<unsigned int>());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeValues(void)
-{
-    _TestStableSortByKeyWithLargeValues<int,    4>();
-    _TestStableSortByKeyWithLargeValues<int,    8>();
-    _TestStableSortByKeyWithLargeValues<int,   16>();
-    
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeValues<int,   32>();
-//    _TestStableSortByKeyWithLargeValues<int,   64>();
-//    _TestStableSortByKeyWithLargeValues<int,  128>();
-//    _TestStableSortByKeyWithLargeValues<int,  256>();
-//    _TestStableSortByKeyWithLargeValues<int,  512>();
-//    _TestStableSortByKeyWithLargeValues<int, 1024>();
-//    _TestStableSortByKeyWithLargeValues<int, 2048>();
-//    _TestStableSortByKeyWithLargeValues<int, 4096>();
-//    _TestStableSortByKeyWithLargeValues<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues);
-
-
-template <typename T, unsigned int N>
-void _TestStableSortByKeyWithLargeKeysAndValues(void)
-{
-    size_t n = (128 * 1024) / sizeof(FixedVector<T,N>);
-
-    thrust::host_vector< FixedVector<T,N> > h_keys(n);
-    thrust::host_vector< FixedVector<T,N> > h_vals(n);
-
-    for(size_t i = 0; i < n; i++)
-    {
-        h_keys[i] = FixedVector<T,N>(rand());
-        h_vals[i] = FixedVector<T,N>(i);
-    }
-
-    thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
-    thrust::device_vector< FixedVector<T,N> > d_vals = h_vals;
-    
-    thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
-    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
-
-    ASSERT_EQUAL_QUIET(h_keys, d_keys);
-    ASSERT_EQUAL_QUIET(h_vals, d_vals);
-}
-
-void TestStableSortByKeyWithLargeKeysAndValues(void)
-{
-    _TestStableSortByKeyWithLargeKeysAndValues<int,    4>();
-    _TestStableSortByKeyWithLargeKeysAndValues<int,    8>();
-    _TestStableSortByKeyWithLargeKeysAndValues<int,   16>();
-
-// XXX these take too long to compile
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,   32>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,   64>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  128>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  256>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int,  512>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 1024>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 2048>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 4096>();
-//    _TestStableSortByKeyWithLargeKeysAndValues<int, 8192>();
-}
-DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues);
-
diff --git a/testing/stable_sort_by_key_large_keys.cu b/testing/stable_sort_by_key_large_keys.cu
new file mode 100644
index 000000000..9ea4d51f8
--- /dev/null
+++ b/testing/stable_sort_by_key_large_keys.cu
@@ -0,0 +1,38 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeKeys(void)
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<FixedVector<int, N>> h_keys(n);
+  thrust::host_vector<unsigned int> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<int>()(uint_i);
+    h_keys[i]           = FixedVector<int, N>(rand_int);
+    h_vals[i]           = uint_i;
+  }
+
+  thrust::device_vector<FixedVector<int, N>> d_keys = h_keys;
+  thrust::device_vector<unsigned int> d_vals        = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeys(void)
+{
+  _TestStableSortByKeyWithLargeKeys<4>();
+  _TestStableSortByKeyWithLargeKeys<8>();
+  _TestStableSortByKeyWithLargeKeys<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeys);
diff --git a/testing/stable_sort_by_key_large_keys_and_values.cu b/testing/stable_sort_by_key_large_keys_and_values.cu
new file mode 100644
index 000000000..eed6b6efa
--- /dev/null
+++ b/testing/stable_sort_by_key_large_keys_and_values.cu
@@ -0,0 +1,38 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeKeysAndValues()
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<FixedVector<int, N>> h_keys(n);
+  thrust::host_vector<FixedVector<int, N>> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<int>()(uint_i);
+    h_keys[i]           = FixedVector<int, N>(rand_int);
+    h_vals[i]           = FixedVector<int, N>(static_cast<int>(i));
+  }
+
+  thrust::device_vector<FixedVector<int, N>> d_keys = h_keys;
+  thrust::device_vector<FixedVector<int, N>> d_vals = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeKeysAndValues()
+{
+  _TestStableSortByKeyWithLargeKeysAndValues<4>();
+  _TestStableSortByKeyWithLargeKeysAndValues<8>();
+  _TestStableSortByKeyWithLargeKeysAndValues<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeKeysAndValues);
diff --git a/testing/stable_sort_by_key_large_values.cu b/testing/stable_sort_by_key_large_values.cu
new file mode 100644
index 000000000..b37753973
--- /dev/null
+++ b/testing/stable_sort_by_key_large_values.cu
@@ -0,0 +1,60 @@
+#include <thrust/functional.h>
+#include <thrust/sort.h>
+
+#include <unittest/unittest.h>
+
+template <typename T>
+struct greater_div_10
+{
+  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const
+  {
+    return ((int)lhs) / 10 > ((int)rhs) / 10;
+  }
+};
+
+template <unsigned int N>
+void _TestStableSortByKeyWithLargeValues()
+{
+  size_t n = (128 * 1024) / sizeof(FixedVector<int, N>);
+
+  thrust::host_vector<unsigned int> h_keys(n);
+  thrust::host_vector<FixedVector<int, N>> h_vals(n);
+
+  for (size_t i = 0; i < n; i++)
+  {
+    const auto uint_i   = static_cast<unsigned int>(i);
+    const auto rand_int = unittest::generate_random_integer<unsigned int>()(uint_i);
+    h_keys[i]           = rand_int;
+    h_vals[i]           = FixedVector<int, N>(static_cast<int>(i));
+  }
+
+  thrust::device_vector<unsigned int> d_keys        = h_keys;
+  thrust::device_vector<FixedVector<int, N>> d_vals = h_vals;
+
+  thrust::stable_sort_by_key(h_keys.begin(), h_keys.end(), h_vals.begin());
+  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_vals.begin());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+
+  // so cuda::stable_merge_sort_by_key() is called
+  thrust::stable_sort_by_key(h_keys.begin(),
+                             h_keys.end(),
+                             h_vals.begin(),
+                             greater_div_10<unsigned int>());
+  thrust::stable_sort_by_key(d_keys.begin(),
+                             d_keys.end(),
+                             d_vals.begin(),
+                             greater_div_10<unsigned int>());
+
+  ASSERT_EQUAL_QUIET(h_keys, d_keys);
+  ASSERT_EQUAL_QUIET(h_vals, d_vals);
+}
+
+void TestStableSortByKeyWithLargeValues()
+{
+  _TestStableSortByKeyWithLargeValues<4>();
+  _TestStableSortByKeyWithLargeValues<8>();
+  _TestStableSortByKeyWithLargeValues<16>();
+}
+DECLARE_UNITTEST(TestStableSortByKeyWithLargeValues);
diff --git a/testing/stable_sort_large.cu b/testing/stable_sort_large.cu
index b89750b38..2b1907cea 100644
--- a/testing/stable_sort_large.cu
+++ b/testing/stable_sort_large.cu
@@ -2,6 +2,7 @@
 #include <thrust/sort.h>
 #include <thrust/functional.h>
 
+
 template <typename T, unsigned int N>
 void _TestStableSortWithLargeKeys(void)
 {
@@ -10,6 +11,7 @@ void _TestStableSortWithLargeKeys(void)
     thrust::host_vector< FixedVector<T,N> > h_keys(n);
 
     for(size_t i = 0; i < n; i++)
+        // XXX Use proper random number generation facility.
         h_keys[i] = FixedVector<T,N>(rand());
 
     thrust::device_vector< FixedVector<T,N> > d_keys = h_keys;
@@ -22,22 +24,9 @@ void _TestStableSortWithLargeKeys(void)
 
 void TestStableSortWithLargeKeys(void)
 {
-    _TestStableSortWithLargeKeys<int,    1>();
     _TestStableSortWithLargeKeys<int,    2>();
-    _TestStableSortWithLargeKeys<int,    4>();
-    _TestStableSortWithLargeKeys<int,    8>();
-    _TestStableSortWithLargeKeys<int,   16>();
-    _TestStableSortWithLargeKeys<int,   32>();
-    _TestStableSortWithLargeKeys<int,   64>();
+    _TestStableSortWithLargeKeys<int,   17>();
     _TestStableSortWithLargeKeys<int,  128>();
-    _TestStableSortWithLargeKeys<int,  256>();
-    _TestStableSortWithLargeKeys<int,  512>();
-    _TestStableSortWithLargeKeys<int, 1024>();
-
-// XXX these take too long to compile
-//    _TestStableSortWithLargeKeys<int, 2048>();
-//    _TestStableSortWithLargeKeys<int, 4096>();
-//    _TestStableSortWithLargeKeys<int, 8192>();
 }
 DECLARE_UNITTEST(TestStableSortWithLargeKeys);
 
diff --git a/testing/swap_ranges.cu b/testing/swap_ranges.cu
index dfe78184d..843c66240 100644
--- a/testing/swap_ranges.cu
+++ b/testing/swap_ranges.cu
@@ -1,6 +1,6 @@
 #include <unittest/unittest.h>
 #include <thrust/swap.h>
-#include <thrust/iterator/iterator_traits.h> 
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/retag.h>
 #include <thrust/system/cpp/memory.h>
 
@@ -55,8 +55,6 @@ DECLARE_UNITTEST(TestSwapRangesDispatchImplicit);
 template <class Vector>
 void TestSwapRangesSimple(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v1(5);
     v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
 
@@ -70,7 +68,7 @@ void TestSwapRangesSimple(void)
     ASSERT_EQUAL(v1[2], 7);
     ASSERT_EQUAL(v1[3], 8);
     ASSERT_EQUAL(v1[4], 9);
-    
+
     ASSERT_EQUAL(v2[0], 0);
     ASSERT_EQUAL(v2[1], 1);
     ASSERT_EQUAL(v2[2], 2);
@@ -90,11 +88,11 @@ void TestSwapRanges(const size_t n)
     thrust::host_vector<T>    h2 = a2;
     thrust::device_vector<T>  d1 = a1;
     thrust::device_vector<T>  d2 = a2;
-  
+
     thrust::swap_ranges(h1.begin(), h1.end(), h2.begin());
     thrust::swap_ranges(d1.begin(), d1.end(), d2.begin());
 
-    ASSERT_EQUAL(h1, a2);  
+    ASSERT_EQUAL(h1, a2);
     ASSERT_EQUAL(d1, a2);
     ASSERT_EQUAL(h2, a1);
     ASSERT_EQUAL(d2, a1);
@@ -149,6 +147,10 @@ struct type_with_swap
     return m_x == other.m_x && m_swapped == other.m_swapped;
   }
 
+#if THRUST_CPP_DIALECT >= 2011
+  type_with_swap & operator=(const type_with_swap &) = default;
+#endif
+
   int m_x;
   bool m_swapped;
 };
diff --git a/testing/tabulate.cu b/testing/tabulate.cu
index fc2e728b7..25c6e40ac 100644
--- a/testing/tabulate.cu
+++ b/testing/tabulate.cu
@@ -6,7 +6,7 @@
 
 
 template<typename ForwardIterator, typename UnaryOperation>
-void tabulate(my_system &system, ForwardIterator first, ForwardIterator, UnaryOperation unary_op)
+void tabulate(my_system &system, ForwardIterator, ForwardIterator, UnaryOperation)
 {
   system.validate_dispatch();
 }
@@ -24,7 +24,7 @@ DECLARE_UNITTEST(TestTabulateDispatchExplicit);
 
 
 template<typename ForwardIterator, typename UnaryOperation>
-void tabulate(my_tag, ForwardIterator first, ForwardIterator, UnaryOperation unary_op)
+void tabulate(my_tag, ForwardIterator first, ForwardIterator, UnaryOperation)
 {
   *first = 13;
 }
diff --git a/testing/transform.cu b/testing/transform.cu
index 630e47393..7e3c3e60f 100644
--- a/testing/transform.cu
+++ b/testing/transform.cu
@@ -12,7 +12,7 @@ template <class Vector>
 void TestTransformUnarySimple(void)
 {
     typedef typename Vector::value_type T;
-    
+
     typename Vector::iterator iter;
 
     Vector input(3);
@@ -22,8 +22,8 @@ void TestTransformUnarySimple(void)
     result[0] = -1; result[1] =  2; result[2] = -3;
 
     iter = thrust::transform(input.begin(), input.end(), output.begin(), thrust::negate<T>());
-    
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(output, result);
 }
 DECLARE_VECTOR_UNITTEST(TestTransformUnarySimple);
@@ -81,7 +81,7 @@ template <class Vector>
 void TestTransformIfUnaryNoStencilSimple(void)
 {
     typedef typename Vector::value_type T;
-    
+
     typename Vector::iterator iter;
 
     Vector input(3);
@@ -89,15 +89,15 @@ void TestTransformIfUnaryNoStencilSimple(void)
     Vector result(3);
 
     input[0]   =  0; input[1]   = -2; input[2]   =  0;
-    output[0]  = -1; output[1]  = -2; output[2]  = -3; 
+    output[0]  = -1; output[1]  = -2; output[2]  = -3;
     result[0]  = -1; result[1]  =  2; result[2]  = -3;
 
     iter = thrust::transform_if(input.begin(), input.end(),
                                 output.begin(),
                                 thrust::negate<T>(),
                                 thrust::identity<T>());
-    
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(output, result);
 }
 DECLARE_VECTOR_UNITTEST(TestTransformIfUnaryNoStencilSimple);
@@ -169,7 +169,7 @@ template <class Vector>
 void TestTransformIfUnarySimple(void)
 {
     typedef typename Vector::value_type T;
-    
+
     typename Vector::iterator iter;
 
     Vector input(3);
@@ -178,7 +178,7 @@ void TestTransformIfUnarySimple(void)
     Vector result(3);
 
     input[0]   =  1; input[1]   = -2; input[2]   =  3;
-    output[0]  =  1; output[1]  =  2; output[2]  =  3; 
+    output[0]  =  1; output[1]  =  2; output[2]  =  3;
     stencil[0] =  1; stencil[1] =  0; stencil[2] =  1;
     result[0]  = -1; result[1]  =  2; result[2]  = -3;
 
@@ -187,8 +187,8 @@ void TestTransformIfUnarySimple(void)
                                 output.begin(),
                                 thrust::negate<T>(),
                                 thrust::identity<T>());
-    
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(output, result);
 }
 DECLARE_VECTOR_UNITTEST(TestTransformIfUnarySimple);
@@ -274,8 +274,8 @@ void TestTransformBinarySimple(void)
     result[0] =  5; result[1] = -7; result[2] = -3;
 
     iter = thrust::transform(input1.begin(), input1.end(), input2.begin(), output.begin(), thrust::minus<T>());
-    
-    ASSERT_EQUAL(iter - output.begin(), input1.size());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
     ASSERT_EQUAL(output, result);
 }
 DECLARE_VECTOR_UNITTEST(TestTransformBinarySimple);
@@ -339,7 +339,7 @@ template <class Vector>
 void TestTransformIfBinarySimple(void)
 {
     typedef typename Vector::value_type T;
-    
+
     typename Vector::iterator iter;
 
     Vector input1(3);
@@ -362,8 +362,8 @@ void TestTransformIfBinarySimple(void)
                                 output.begin(),
                                 thrust::minus<T>(),
                                 thrust::not1(identity));
-    
-    ASSERT_EQUAL(iter - output.begin(), input1.size());
+
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input1.size());
     ASSERT_EQUAL(output, result);
 }
 DECLARE_VECTOR_UNITTEST(TestTransformIfBinarySimple);
@@ -454,7 +454,7 @@ void TestTransformUnary(const size_t n)
 
     thrust::transform(h_input.begin(), h_input.end(), h_output.begin(), thrust::negate<T>());
     thrust::transform(d_input.begin(), d_input.end(), d_output.begin(), thrust::negate<T>());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformUnary);
@@ -473,7 +473,7 @@ void TestTransformUnaryToDiscardIterator(const size_t n)
       thrust::transform(d_input.begin(), d_input.end(), thrust::make_discard_iterator(), thrust::negate<T>());
 
     thrust::discard_iterator<> reference(n);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
@@ -521,7 +521,7 @@ void TestTransformUnaryToDiscardIteratorZipped(const size_t n)
     thrust::discard_iterator<> reference(n);
 
     ASSERT_EQUAL(h_output, d_output);
-    
+
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(h_result.get_iterator_tuple()));
     ASSERT_EQUAL_QUIET(reference, thrust::get<1>(d_result.get_iterator_tuple()));
 }
@@ -554,7 +554,7 @@ void TestTransformIfUnaryNoStencil(const size_t n)
     thrust::transform_if(d_input.begin(), d_input.end(),
                          d_output.begin(),
                          thrust::negate<T>(), is_positive());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformIfUnaryNoStencil);
@@ -580,7 +580,7 @@ void TestTransformIfUnary(const size_t n)
                           d_stencil.begin(),
                           d_output.begin(),
                           thrust::negate<T>(), is_positive());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformIfUnary);
@@ -608,7 +608,7 @@ void TestTransformIfUnaryToDiscardIterator(const size_t n)
                            thrust::negate<T>(), is_positive());
 
     thrust::discard_iterator<> reference(n);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
@@ -628,12 +628,12 @@ void TestTransformBinary(const size_t n)
 
     thrust::transform(h_input1.begin(), h_input1.end(), h_input2.begin(), h_output.begin(), thrust::minus<T>());
     thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), d_output.begin(), thrust::minus<T>());
-    
+
     ASSERT_EQUAL(h_output, d_output);
-    
+
     thrust::transform(h_input1.begin(), h_input1.end(), h_input2.begin(), h_output.begin(), thrust::multiplies<T>());
     thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), d_output.begin(), thrust::multiplies<T>());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformBinary);
@@ -653,7 +653,7 @@ void TestTransformBinaryToDiscardIterator(const size_t n)
       thrust::transform(d_input1.begin(), d_input1.end(), d_input2.begin(), thrust::make_discard_iterator(), thrust::minus<T>());
 
     thrust::discard_iterator<> reference(n);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
@@ -684,12 +684,12 @@ void TestTransformIfBinary(const size_t n)
                          d_stencil.begin(),
                          d_output.begin(),
                          thrust::minus<T>(), is_positive());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 
     h_stencil = unittest::random_integers<T>(n);
     d_stencil = h_stencil;
-    
+
     thrust::transform_if(h_input1.begin(), h_input1.end(),
                          h_input2.begin(),
                          h_stencil.begin(),
@@ -701,7 +701,7 @@ void TestTransformIfBinary(const size_t n)
                          d_stencil.begin(),
                          d_output.begin(),
                          thrust::multiplies<T>(), is_positive());
-    
+
     ASSERT_EQUAL(h_output, d_output);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformIfBinary);
@@ -733,23 +733,33 @@ void TestTransformIfBinaryToDiscardIterator(const size_t n)
                            thrust::minus<T>(), is_positive());
 
     thrust::discard_iterator<> reference(n);
-    
+
     ASSERT_EQUAL_QUIET(reference, h_result);
     ASSERT_EQUAL_QUIET(reference, d_result);
 }
 DECLARE_VARIABLE_UNITTEST(TestTransformIfBinaryToDiscardIterator);
 
 
-template <class T>
-  void TestTransformUnaryCountingIterator(size_t n)
+#if ((__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400) || defined(__INTEL_COMPILER)
+template <typename T>
+void TestTransformUnaryCountingIterator()
 {
-    // GCC 4.4.x has a known failure with auto-vectorization (due to -O3 or -ftree-vectorize) of this test
+    // G++ 4.4.x has a known failure with auto-vectorization (due to -O3 or
+    // -ftree-vectorize) of this test.
     // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43251
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400
+
+    // ICPC has a known failure with auto-vectorization (due to -O2 or
+    // higher) of this test.
+    // See nvbug 200326708.
     KNOWN_FAILURE;
+}
 #else
-    // be careful not to generate a range larger than we can represent
-    n = thrust::min<size_t>(n, std::numeric_limits<T>::max());
+template <typename T>
+void TestTransformUnaryCountingIterator()
+{
+    size_t const n = 15 * sizeof(T);
+
+    ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
 
     thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
     thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
@@ -761,20 +771,26 @@ template <class T>
     thrust::transform(d_first, d_first + n, d_result.begin(), thrust::identity<T>());
 
     ASSERT_EQUAL(h_result, d_result);
-#endif
 }
-DECLARE_VARIABLE_UNITTEST(TestTransformUnaryCountingIterator);
+#endif
+DECLARE_GENERIC_UNITTEST(TestTransformUnaryCountingIterator);
 
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400
 template <typename T>
-  void TestTransformBinaryCountingIterator(size_t n)
+void TestTransformBinaryCountingIterator()
 {
     // GCC 4.4.x has a known failure with auto-vectorization (due to -O3 or -ftree-vectorize) of this test
     // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43251
-#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100) == 40400
+
     KNOWN_FAILURE;
+}
 #else
-    // be careful not to generate a range larger than we can represent
-    n = thrust::min<size_t>(n, std::numeric_limits<T>::max());
+template <typename T>
+void TestTransformBinaryCountingIterator()
+{
+    size_t const n = 15 * sizeof(T);
+
+    ASSERT_LEQUAL(T(n), unittest::truncate_to_max_representable<T>(n));
 
     thrust::counting_iterator<T, thrust::host_system_tag>   h_first = thrust::make_counting_iterator<T>(0);
     thrust::counting_iterator<T, thrust::device_system_tag> d_first = thrust::make_counting_iterator<T>(0);
@@ -786,9 +802,9 @@ template <typename T>
     thrust::transform(d_first, d_first + n, d_first, d_result.begin(), thrust::plus<T>());
 
     ASSERT_EQUAL(h_result, d_result);
-#endif
 }
-DECLARE_VARIABLE_UNITTEST(TestTransformBinaryCountingIterator);
+#endif
+DECLARE_GENERIC_UNITTEST(TestTransformBinaryCountingIterator);
 
 
 template <typename T>
@@ -814,7 +830,7 @@ void TestTransformWithIndirection(void)
     Vector input1(7);
     Vector input2(7);
     Vector output(7, 0);
-    input1[0] = 0;  input2[0] = 2; 
+    input1[0] = 0;  input2[0] = 2;
     input1[1] = 1;  input2[1] = 2;
     input1[2] = 2;  input2[2] = 2;
     input1[3] = 1;  input2[3] = 0;
@@ -831,10 +847,10 @@ void TestTransformWithIndirection(void)
     table[5] = 2;
 
     thrust::transform(input1.begin(), input1.end(),
-                      input2.begin(), 
+                      input2.begin(),
                       output.begin(),
                       plus_mod3<T>(thrust::raw_pointer_cast(&table[0])));
-    
+
     ASSERT_EQUAL(output[0], T(2));
     ASSERT_EQUAL(output[1], T(0));
     ASSERT_EQUAL(output[2], T(1));
@@ -843,5 +859,5 @@ void TestTransformWithIndirection(void)
     ASSERT_EQUAL(output[5], T(1));
     ASSERT_EQUAL(output[6], T(1));
 }
-DECLARE_VECTOR_UNITTEST(TestTransformWithIndirection);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformWithIndirection);
 
diff --git a/testing/transform_input_output_iterator.cu b/testing/transform_input_output_iterator.cu
new file mode 100644
index 000000000..7df163077
--- /dev/null
+++ b/testing/transform_input_output_iterator.cu
@@ -0,0 +1,122 @@
+#include <unittest/unittest.h>
+#include <thrust/iterator/transform_input_output_iterator.h>
+
+#include <thrust/copy.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h>
+#include <thrust/sequence.h>
+#include <thrust/iterator/counting_iterator.h>
+
+template <class Vector>
+void TestTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector squared(4);
+    Vector negated(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    // construct transform_iterator
+    thrust::transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+        transform_iter(squared.begin(), InputFunction(), OutputFunction());
+
+    // transform_iter writes squared value
+    thrust::copy(input.begin(), input.end(), transform_iter);
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+    // negated value read from transform_iter
+    thrust::copy_n(transform_iter, squared.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -4;
+    gold_negated[2] = -9;
+    gold_negated[3] = -16;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformInputOutputIterator);
+
+template <class Vector>
+void TestMakeTransformInputOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::negate<T> InputFunction;
+    typedef thrust::square<T> OutputFunction;
+
+    Vector input(4);
+    Vector negated(4);
+    Vector squared(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+
+    // negated value read from transform iterator
+    thrust::copy_n(thrust::make_transform_input_output_iterator(input.begin(), InputFunction(), OutputFunction()),
+                   input.size(), negated.begin());
+
+    Vector gold_negated(4);
+    gold_negated[0] = -1;
+    gold_negated[1] = -2;
+    gold_negated[2] = -3;
+    gold_negated[3] = -4;
+
+    ASSERT_EQUAL(negated, gold_negated);
+
+    // squared value writen by transform iterator
+    thrust::copy(negated.begin(), negated.end(),
+                 thrust::make_transform_input_output_iterator(squared.begin(), InputFunction(), OutputFunction()));
+
+    Vector gold_squared(4);
+    gold_squared[0] = 1;
+    gold_squared[1] = 4;
+    gold_squared[2] = 9;
+    gold_squared[3] = 16;
+
+    ASSERT_EQUAL(squared, gold_squared);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformInputOutputIterator);
+
+template <typename T>
+struct TestTransformInputOutputIteratorScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        // run on host (uses forward iterator negate)
+        thrust::inclusive_scan(thrust::make_transform_input_output_iterator(h_data.begin(), thrust::negate<T>(), thrust::identity<T>()),
+                               thrust::make_transform_input_output_iterator(h_data.end(),   thrust::negate<T>(), thrust::identity<T>()),
+                               h_result.begin());
+        // run on device (uses reverse iterator negate)
+        thrust::inclusive_scan(d_data.begin(), d_data.end(),
+                               thrust::make_transform_input_output_iterator(
+                                   d_result.begin(), thrust::square<T>(), thrust::negate<T>()));
+
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformInputOutputIteratorScan, IntegralTypes> TestTransformInputOutputIteratorScanInstance;
+
diff --git a/testing/transform_iterator.cu b/testing/transform_iterator.cu
index e28e333e1..a960a0b44 100644
--- a/testing/transform_iterator.cu
+++ b/testing/transform_iterator.cu
@@ -7,6 +7,8 @@
 #include <thrust/sequence.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include <memory>
+
 template <class Vector>
 void TestTransformIterator(void)
 {
@@ -84,3 +86,28 @@ struct TestTransformIteratorReduce
 };
 VariableUnitTest<TestTransformIteratorReduce, IntegralTypes> TestTransformIteratorReduceInstance;
 
+
+struct ExtractValue{
+    int operator()(std::unique_ptr<int> const& n){
+        return *n;
+    }
+};
+
+void TestTransformIteratorNonCopyable(){
+
+    thrust::host_vector<std::unique_ptr<int>> hv(4);
+    hv[0].reset(new int{1});
+    hv[1].reset(new int{2});
+    hv[2].reset(new int{3});
+    hv[3].reset(new int{4});
+
+    auto transformed = thrust::make_transform_iterator(hv.begin(), ExtractValue{});
+    ASSERT_EQUAL(transformed[0], 1);
+    ASSERT_EQUAL(transformed[1], 2);
+    ASSERT_EQUAL(transformed[2], 3);
+    ASSERT_EQUAL(transformed[3], 4);
+
+}
+
+DECLARE_UNITTEST(TestTransformIteratorNonCopyable);
+
diff --git a/testing/transform_output_iterator.cu b/testing/transform_output_iterator.cu
new file mode 100644
index 000000000..27f8b53bd
--- /dev/null
+++ b/testing/transform_output_iterator.cu
@@ -0,0 +1,93 @@
+#include <unittest/unittest.h>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+
+template <class Vector>
+void TestTransformOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::square<T> UnaryFunction;
+    typedef typename Vector::iterator Iterator;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), T{1});
+   
+    // construct transform_iterator
+    thrust::transform_output_iterator<UnaryFunction, Iterator> output_iter(output.begin(), UnaryFunction());
+
+    thrust::copy(input.begin(), input.end(), output_iter);
+
+    Vector gold_output(4);
+    gold_output[0] = 1;
+    gold_output[1] = 4;
+    gold_output[2] = 9;
+    gold_output[3] = 16;
+
+    ASSERT_EQUAL(output, gold_output);
+
+}
+DECLARE_VECTOR_UNITTEST(TestTransformOutputIterator);
+
+template <class Vector>
+void TestMakeTransformOutputIterator(void)
+{
+    typedef typename Vector::value_type T;
+
+    typedef thrust::square<T> UnaryFunction;
+
+    Vector input(4);
+    Vector output(4);
+    
+    // initialize input
+    thrust::sequence(input.begin(), input.end(), 1);
+   
+    thrust::copy(input.begin(), input.end(),
+                 thrust::make_transform_output_iterator(output.begin(), UnaryFunction()));
+
+    Vector gold_output(4);
+    gold_output[0] = 1;
+    gold_output[1] = 4;
+    gold_output[2] = 9;
+    gold_output[3] = 16;
+    ASSERT_EQUAL(output, gold_output);
+
+}
+DECLARE_VECTOR_UNITTEST(TestMakeTransformOutputIterator);
+
+template <typename T>
+struct TestTransformOutputIteratorScan
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T>   h_data = unittest::random_samples<T>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        thrust::host_vector<T>   h_result(n);
+        thrust::device_vector<T> d_result(n);
+
+        // run on host
+        thrust::inclusive_scan(thrust::make_transform_iterator(h_data.begin(), thrust::negate<T>()),
+                               thrust::make_transform_iterator(h_data.end(),   thrust::negate<T>()),
+                               h_result.begin());
+        // run on device
+        thrust::inclusive_scan(d_data.begin(), d_data.end(),
+                               thrust::make_transform_output_iterator(
+                                   d_result.begin(), thrust::negate<T>()));
+
+
+        ASSERT_EQUAL(h_result, d_result);
+    }
+};
+VariableUnitTest<TestTransformOutputIteratorScan, SignedIntegralTypes> TestTransformOutputIteratorScanInstance;
+
diff --git a/testing/transform_output_iterator_reduce_by_key.cu b/testing/transform_output_iterator_reduce_by_key.cu
new file mode 100644
index 000000000..f7004f8c7
--- /dev/null
+++ b/testing/transform_output_iterator_reduce_by_key.cu
@@ -0,0 +1,51 @@
+#include <unittest/unittest.h>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+
+template <typename T>
+struct TestTransformOutputIteratorReduceByKey
+{
+  void operator()(const size_t n)
+  {
+    thrust::host_vector<T> h_keys = unittest::random_samples<T>(n);
+    thrust::sort(h_keys.begin(), h_keys.end());
+    thrust::device_vector<T> d_keys = h_keys;
+
+    thrust::host_vector<T> h_values   = unittest::random_samples<T>(n);
+    thrust::device_vector<T> d_values = h_values;
+
+    thrust::host_vector<T> h_result(n);
+    thrust::device_vector<T> d_result(n);
+
+    // run on host
+    thrust::reduce_by_key(thrust::host,
+                          h_keys.begin(),
+                          h_keys.end(),
+                          thrust::make_transform_iterator(h_values.begin(), thrust::negate<T>()),
+                          thrust::discard_iterator<T>{},
+                          h_result.begin());
+    // run on device
+    thrust::reduce_by_key(thrust::device,
+                          d_keys.begin(),
+                          d_keys.end(),
+                          d_values.begin(),
+                          thrust::discard_iterator<T>{},
+                          thrust::make_transform_output_iterator(d_result.begin(),
+                                                                 thrust::negate<T>()));
+
+    ASSERT_EQUAL(h_result, d_result);
+  }
+};
+VariableUnitTest<TestTransformOutputIteratorReduceByKey, SignedIntegralTypes>
+  TestTransformOutputIteratorReduceByKeyInstance;
+
diff --git a/testing/transform_reduce.cu b/testing/transform_reduce.cu
index 945dc8d0d..3ff3159d6 100644
--- a/testing/transform_reduce.cu
+++ b/testing/transform_reduce.cu
@@ -124,5 +124,5 @@ void TestTransformReduceCountingIterator(void)
 
     ASSERT_EQUAL(result, -6);
 }
-DECLARE_VECTOR_UNITTEST(TestTransformReduceCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformReduceCountingIterator);
 
diff --git a/testing/transform_scan.cu b/testing/transform_scan.cu
index fe24c2286..2b6e35a2a 100644
--- a/testing/transform_scan.cu
+++ b/testing/transform_scan.cu
@@ -156,21 +156,21 @@ void TestTransformScanSimple(void)
     // inclusive scan
     iter = thrust::transform_inclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), thrust::plus<T>());
     result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
     // exclusive scan with 0 init
     iter = thrust::transform_exclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), 0, thrust::plus<T>());
     result[0] = 0; result[1] = -1; result[2] = -4; result[3] = -2; result[4] = -6;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
     // exclusive scan with nonzero init
     iter = thrust::transform_exclusive_scan(input.begin(), input.end(), output.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
     result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
-    ASSERT_EQUAL(iter - output.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - output.begin()), input.size());
     ASSERT_EQUAL(input,  input_copy);
     ASSERT_EQUAL(output, result);
     
@@ -178,18 +178,73 @@ void TestTransformScanSimple(void)
     input = input_copy;
     iter = thrust::transform_inclusive_scan(input.begin(), input.end(), input.begin(), thrust::negate<T>(), thrust::plus<T>());
     result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
-    ASSERT_EQUAL(iter - input.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
     ASSERT_EQUAL(input, result);
 
     // inplace exclusive scan with init
     input = input_copy;
     iter = thrust::transform_exclusive_scan(input.begin(), input.end(), input.begin(), thrust::negate<T>(), 3, thrust::plus<T>());
     result[0] = 3; result[1] = 2; result[2] = -1; result[3] = 1; result[4] = -3;
-    ASSERT_EQUAL(iter - input.begin(), input.size());
+    ASSERT_EQUAL(std::size_t(iter - input.begin()), input.size());
     ASSERT_EQUAL(input, result);
 }
-DECLARE_VECTOR_UNITTEST(TestTransformScanSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanSimple);
 
+struct Record {
+    int number;
+
+    bool operator==(const Record& rhs) const {
+        return number == rhs.number;
+    }
+    bool operator!=(const Record& rhs) const {
+        return !(rhs == *this);
+    }
+    friend Record operator+(Record lhs, const Record& rhs) {
+        lhs.number += rhs.number;
+        return lhs;
+    }
+    friend std::ostream& operator<<(std::ostream& os, const Record& record) {
+        os << "number: " << record.number;
+        return os;
+    }
+};
+
+struct negate {
+    __host__ __device__ int operator()(Record const& record) const
+    {
+        return - record.number;
+    }
+};
+
+void TestTransformInclusiveScanDifferentTypes()
+{
+    typename thrust::host_vector<int>::iterator h_iter;
+
+    thrust::host_vector<Record> h_input(5);
+    thrust::host_vector<int> h_output(5);
+    thrust::host_vector<int> result(5);
+
+    h_input[0] = {1}; h_input[1] = {3}; h_input[2] = {-2}; h_input[3] = {4}; h_input[4] = {-5};
+
+    thrust::host_vector<Record> input_copy(h_input);
+
+    h_iter = thrust::transform_inclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), negate{}, thrust::plus<int>{});
+    result[0] = -1; result[1] = -4; result[2] = -2; result[3] = -6; result[4] = -1;
+    ASSERT_EQUAL(std::size_t(h_iter - h_output.begin()), h_input.size());
+    ASSERT_EQUAL(h_input, input_copy);
+    ASSERT_EQUAL(h_output, result);
+
+    typename thrust::device_vector<int>::iterator d_iter;
+
+    thrust::device_vector<Record> d_input = h_input;
+    thrust::device_vector<int> d_output(5);
+
+    d_iter = thrust::transform_inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), negate{}, thrust::plus<int>{});
+    ASSERT_EQUAL(std::size_t(d_iter - d_output.begin()), d_input.size());
+    ASSERT_EQUAL(d_input, input_copy);
+    ASSERT_EQUAL(d_output, result);
+}
+DECLARE_UNITTEST(TestTransformInclusiveScanDifferentTypes);
 
 template <typename T>
 struct TestTransformScan
@@ -242,7 +297,7 @@ void TestTransformScanCountingIterator(void)
     ASSERT_EQUAL(result[1], -3);
     ASSERT_EQUAL(result[2], -6);
 }
-DECLARE_VECTOR_UNITTEST(TestTransformScanCountingIterator);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestTransformScanCountingIterator);
 
 template <typename T>
 struct TestTransformScanToDiscardIterator
@@ -292,3 +347,55 @@ struct TestTransformScanToDiscardIterator
 };
 VariableUnitTest<TestTransformScanToDiscardIterator, IntegralTypes> TestTransformScanToDiscardIteratorInstance;
 
+// Regression test for https://github.com/NVIDIA/thrust/issues/1332
+// The issue was the internal transform_input_iterator_t created by the
+// transform_inclusive_scan implementation was instantiated using a reference
+// type for the value_type.
+template <typename T>
+void TestValueCategoryDeduction()
+{
+    thrust::device_vector<T> vec;
+
+    T a_h[10] = {5, 0, 5, 8, 6, 7, 5, 3, 0, 9};
+    vec.assign((T*)a_h, a_h + 10);
+
+
+    thrust::transform_inclusive_scan(thrust::device,
+                                     vec.cbegin(),
+                                     vec.cend(),
+                                     vec.begin(),
+                                     thrust::identity<>{},
+                                     thrust::maximum<>{});
+
+    ASSERT_EQUAL(T{5}, vec[0]);
+    ASSERT_EQUAL(T{5}, vec[1]);
+    ASSERT_EQUAL(T{5}, vec[2]);
+    ASSERT_EQUAL(T{8}, vec[3]);
+    ASSERT_EQUAL(T{8}, vec[4]);
+    ASSERT_EQUAL(T{8}, vec[5]);
+    ASSERT_EQUAL(T{8}, vec[6]);
+    ASSERT_EQUAL(T{8}, vec[7]);
+    ASSERT_EQUAL(T{8}, vec[8]);
+    ASSERT_EQUAL(T{9}, vec[9]);
+
+    vec.assign((T*)a_h, a_h + 10);
+    thrust::transform_exclusive_scan(thrust::device,
+                                     vec.cbegin(),
+                                     vec.cend(),
+                                     vec.begin(),
+                                     thrust::identity<>{},
+                                     T{},
+                                     thrust::maximum<>{});
+
+    ASSERT_EQUAL(T{0}, vec[0]);
+    ASSERT_EQUAL(T{5}, vec[1]);
+    ASSERT_EQUAL(T{5}, vec[2]);
+    ASSERT_EQUAL(T{5}, vec[3]);
+    ASSERT_EQUAL(T{8}, vec[4]);
+    ASSERT_EQUAL(T{8}, vec[5]);
+    ASSERT_EQUAL(T{8}, vec[6]);
+    ASSERT_EQUAL(T{8}, vec[7]);
+    ASSERT_EQUAL(T{8}, vec[8]);
+    ASSERT_EQUAL(T{8}, vec[9]);
+}
+DECLARE_GENERIC_UNITTEST(TestValueCategoryDeduction);
diff --git a/testing/trivial_sequence.cu b/testing/trivial_sequence.cu
index 1458f59b0..6dee8e5ef 100644
--- a/testing/trivial_sequence.cu
+++ b/testing/trivial_sequence.cu
@@ -1,5 +1,6 @@
 #include <unittest/unittest.h>
 #include <thrust/detail/trivial_sequence.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
 #include <thrust/iterator/zip_iterator.h> 
 
@@ -25,8 +26,8 @@ void test(Iterator first, Iterator last)
 
     typedef typename thrust::detail::trivial_sequence<Iterator,System>::iterator_type TrivialIterator;
 
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<Iterator>::value,        false);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<TrivialIterator>::value,  true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<Iterator>::value,        false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<TrivialIterator>::value,  true);
 }
 
 template <class Vector>
diff --git a/testing/trivial_tests/.gitignore b/testing/trivial_tests/.gitignore
deleted file mode 100644
index 3197c98a4..000000000
--- a/testing/trivial_tests/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-*.cu
-*.cpp
-
diff --git a/testing/trivial_tests/SConscript b/testing/trivial_tests/SConscript
deleted file mode 100644
index c216981cc..000000000
--- a/testing/trivial_tests/SConscript
+++ /dev/null
@@ -1,88 +0,0 @@
-import os
-from time import sleep
-from warnings import warn
-
-Import('env')
-my_env = env.Clone()
-
-thrust_abspath = os.path.abspath("../../thrust/")
-
-# this function builds a trivial source file from a Thrust header
-def trivial_source_from_header(source, target, env):
-  target_filename = str(target[0])
-  fid = open(target_filename, 'w')
-
-  # make sure we don't trip over <windows.h> when compiling with cl.exe
-  if my_env.subst('$CXX') == 'cl':
-    fid.write('#include <windows.h>\n')
-
-  for src in source:
-    fid.write('#include <' + str(src) + '>\n')
-  fid.close()
-
-  # XXX WAR race condition on Windows discussed here:
-  #         http://scons.tigris.org/ds/viewMessage.do?dsForumId=1272&dsMessageId=807348
-  if os.name == 'nt':
-    sleep(0.1)
-
-
-# CUFile builds a trivial .cu file from a Thrust header
-cu_from_header_builder = Builder(action = trivial_source_from_header,
-                                 suffix = '.cu',
-                                 src_suffix = '.h')
-my_env.Append(BUILDERS = {'CUFile' : cu_from_header_builder})
-
-# CPPFile builds a trivial .cpp file from a Thrust header
-cpp_from_header_builder = Builder(action = trivial_source_from_header,
-                                  suffix = '.cpp',
-                                  src_suffix = '.h')
-my_env.Append(BUILDERS = {'CPPFile' : cpp_from_header_builder})
-
-# gather all public thrust headers
-public_thrust_headers = my_env.RecursiveGlob('*.h', '#thrust', exclude='detail|system')
-
-# omit headers from systems which are not the host or device system
-public_thrust_headers.extend(my_env.Glob('*.h', '#thrust/system'))
-public_thrust_headers.extend(my_env.RecursiveGlob('*.h', '#thrust/system/' + env['host_backend'], exclude='detail'))
-if env['device_backend'] != env['host_backend']:
-  public_thrust_headers.extend(my_env.RecursiveGlob('*.h', '#thrust/system/' + env['device_backend'], exclude='detail')) 
-
-sources = []
-
-for hdr in public_thrust_headers:
-  rel_path = Dir('#thrust').rel_path(hdr)
-  
-  # replace slashes with '_slash_'
-  src_filename = rel_path.replace('/', '_slash_').replace('\\', '_slash_')
-
-  cu  = my_env.CUFile(src_filename.replace('.h', '.cu'), hdr)
-  cpp = my_env.CPPFile(src_filename.replace('.h', '_cpp.cpp'), hdr)
-
-  sources.extend([cu,cpp])
-
-  # ensure that all files #include <thrust/detail/config.h>
-  if '#include <thrust/detail/config.h>' not in hdr.get_contents():
-    warn('Header ' + str(hdr) + ' does not include <thrust/detail/config.h>')
-
-# generate source files which #include all headers
-all_headers_cu  = my_env.CUFile('all_headers.cu', public_thrust_headers)
-all_headers_cpp = my_env.CUFile('all_headers_cpp.cpp', public_thrust_headers)
-
-sources.append(all_headers_cu)
-sources.append(all_headers_cpp)
-
-# and the file with main()
-sources.append('main.cu')
-
-# build the tester
-tester = my_env.Program('tester', sources)
-
-# add the tester to the 'run_trivial_tests' alias
-tester_alias = my_env.Alias('run_trivial_tests', [tester], tester[0].abspath)
-
-# always build the 'run_trivial_tests' target whether or not it needs it
-my_env.AlwaysBuild(tester_alias)
-
-# add the trivial tests alias to the 'run_tests' alias
-my_env.Alias('run_tests', [tester], tester[0].abspath)
-
diff --git a/testing/trivial_tests/main.cu b/testing/trivial_tests/main.cu
deleted file mode 100644
index 5ab8d0fce..000000000
--- a/testing/trivial_tests/main.cu
+++ /dev/null
@@ -1 +0,0 @@
-int main(void){ return 0; }
diff --git a/testing/tuple.cu b/testing/tuple.cu
index ba7c82923..40dccbd22 100644
--- a/testing/tuple.cu
+++ b/testing/tuple.cu
@@ -90,7 +90,7 @@ struct TestTupleConstructor
     ASSERT_EQUAL(data[9], get<9>(t10));
   }
 };
-SimpleUnitTest<TestTupleConstructor, NumericTypes> TestTupleConstructorInstance;
+SimpleUnitTest<TestTupleConstructor, BuiltinNumericTypes> TestTupleConstructorInstance;
 
 template <typename T>
 struct TestMakeTuple
@@ -177,7 +177,7 @@ struct TestMakeTuple
     ASSERT_EQUAL(data[9], get<9>(t10));
   }
 };
-SimpleUnitTest<TestMakeTuple, NumericTypes> TestMakeTupleInstance;
+SimpleUnitTest<TestMakeTuple, BuiltinNumericTypes> TestMakeTupleInstance;
 
 template <typename T>
 struct TestTupleGet
@@ -263,7 +263,7 @@ struct TestTupleGet
     ASSERT_EQUAL(data[9], thrust::get<9>(t10));
   }
 };
-SimpleUnitTest<TestTupleGet, NumericTypes> TestTupleGetInstance;
+SimpleUnitTest<TestTupleGet, BuiltinNumericTypes> TestTupleGetInstance;
 
 
 
@@ -342,38 +342,41 @@ struct TestTupleTieFunctor
     T data[10];
     clear(data);
 
-    tie(data[0]) = make_tuple(0);;
-    result &= data[0] == 0;
+    // 17 and not 0 to avoid triggering custom_numeric's `operator void *` and a comparison with a null pointer
+    // TODO: get this back from 17 to 0 once C++11 is on everywhere and that operator on custom_numeric is changed
+    // to an explicit operator bool
+    tie(data[0]) = make_tuple(17);
+    result &= data[0] == 17;
     clear(data);
 
-    tie(data[0], data[1]) = make_tuple(0,1);
-    result &= data[0] == 0;
+    tie(data[0], data[1]) = make_tuple(17,1);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     clear(data);
 
-    tie(data[0], data[1], data[2]) = make_tuple(0,1,2);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2]) = make_tuple(17,1,2);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3]) = make_tuple(0,1,2,3);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3]) = make_tuple(17,1,2,3);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4]) = make_tuple(0,1,2,3,4);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4]) = make_tuple(17,1,2,3,4);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
     result &= data[4] == 4;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4], data[5]) = make_tuple(0,1,2,3,4,5);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4], data[5]) = make_tuple(17,1,2,3,4,5);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
@@ -381,8 +384,8 @@ struct TestTupleTieFunctor
     result &= data[5] == 5;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6]) = make_tuple(0,1,2,3,4,5,6);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6]) = make_tuple(17,1,2,3,4,5,6);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
@@ -391,8 +394,8 @@ struct TestTupleTieFunctor
     result &= data[6] == 6;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]) = make_tuple(0,1,2,3,4,5,6,7);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]) = make_tuple(17,1,2,3,4,5,6,7);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
@@ -402,8 +405,8 @@ struct TestTupleTieFunctor
     result &= data[7] == 7;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]) = make_tuple(0,1,2,3,4,5,6,7,8);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8]) = make_tuple(17,1,2,3,4,5,6,7,8);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
@@ -414,8 +417,8 @@ struct TestTupleTieFunctor
     result &= data[8] == 8;
     clear(data);
 
-    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]) = make_tuple(0,1,2,3,4,5,6,7,8,9);
-    result &= data[0] == 0;
+    tie(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7], data[8], data[9]) = make_tuple(17,1,2,3,4,5,6,7,8,9);
+    result &= data[0] == 17;
     result &= data[1] == 1;
     result &= data[2] == 2;
     result &= data[3] == 3;
diff --git a/testing/tuple_algorithms.cu b/testing/tuple_algorithms.cu
new file mode 100644
index 000000000..449fdc2f1
--- /dev/null
+++ b/testing/tuple_algorithms.cu
@@ -0,0 +1,62 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <unittest/unittest.h>
+
+#include <thrust/detail/tuple_algorithms.h>
+#include <thrust/type_traits/integer_sequence.h>
+
+// FIXME: Replace with C++14 style `thrust::square<>` when we have it.
+struct custom_square
+{
+  template <typename T>
+  __host__ __device__
+  T operator()(T v) const
+  {
+    return v * v;
+  }
+};
+
+struct custom_square_inplace
+{
+  template <typename T>
+  __host__ __device__
+  void operator()(T& v) const
+  {
+    v *= v;
+  }
+};
+
+void test_tuple_subset()
+{
+  auto t0 = std::make_tuple(0, 2, 3.14);
+
+  auto t1 = thrust::tuple_subset(t0, thrust::index_sequence<2, 0>{});
+
+  ASSERT_EQUAL_QUIET(t1, std::make_tuple(3.14, 0));
+}
+DECLARE_UNITTEST(test_tuple_subset);
+
+void test_tuple_transform()
+{
+  auto t0 = std::make_tuple(0, 2, 3.14);
+
+  auto t1 = thrust::tuple_transform(t0, custom_square{});
+
+  ASSERT_EQUAL_QUIET(t1, std::make_tuple(0, 4, 9.8596));
+}
+DECLARE_UNITTEST(test_tuple_transform);
+
+void test_tuple_for_each()
+{
+  auto t = std::make_tuple(0, 2, 3.14);
+
+  thrust::tuple_for_each(t, custom_square_inplace{});
+
+  ASSERT_EQUAL_QUIET(t, std::make_tuple(0, 4, 9.8596));
+}
+DECLARE_UNITTEST(test_tuple_for_each);
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/testing/tuple_scan.cu b/testing/tuple_scan.cu
index e847a4362..d0565d6d4 100644
--- a/testing/tuple_scan.cu
+++ b/testing/tuple_scan.cu
@@ -4,7 +4,7 @@
 #include <thrust/transform.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include <backend/cuda/testframework.h>
+#include <unittest/cuda/testframework.h>
 #endif
 
 using namespace unittest;
@@ -58,18 +58,6 @@ struct TestTupleScan
      inclusive_scan(d_input.begin(), d_input.end(), d_output.begin(), SumTupleFunctor());
      ASSERT_EQUAL_QUIET(h_output, d_output);
 
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
      // exclusive_scan
      tuple<T,T> init(13,17);
      exclusive_scan(h_input.begin(), h_input.end(), h_output.begin(), init, SumTupleFunctor());
diff --git a/testing/type_traits.cu b/testing/type_traits.cu
index bfbd128e0..339e11b90 100644
--- a/testing/type_traits.cu
+++ b/testing/type_traits.cu
@@ -1,6 +1,7 @@
 #include <unittest/unittest.h>
 
 #include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/device_ptr.h>
 
@@ -64,22 +65,22 @@ void TestIsPlainOldData(void)
 }
 DECLARE_UNITTEST(TestIsPlainOldData);
 
-void TestIsTrivialIterator(void)
+void TestIsContiguousIterator(void)
 {
     typedef thrust::host_vector<int>   HostVector;
     typedef thrust::device_vector<int> DeviceVector;
     
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator< int * >::value, true);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator< thrust::device_ptr<int> >::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< int * >::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< thrust::device_ptr<int> >::value, true);
 
 
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<HostVector::iterator>::value, true);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<HostVector::const_iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<HostVector::iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<HostVector::const_iterator>::value, true);
 
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<DeviceVector::iterator>::value, true);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<DeviceVector::const_iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<DeviceVector::iterator>::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<DeviceVector::const_iterator>::value, true);
 
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator< thrust::device_ptr<int> >::value, true);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator< thrust::device_ptr<int> >::value, true);
 
     typedef thrust::tuple< HostVector::iterator,   HostVector::iterator   > HostIteratorTuple;
 
@@ -88,13 +89,13 @@ void TestIsTrivialIterator(void)
     typedef thrust::transform_iterator<thrust::identity<int>, HostVector::iterator > TransformIterator;
     typedef thrust::zip_iterator< HostIteratorTuple >  ZipIterator;
 
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<ConstantIterator>::value,  false);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<CountingIterator>::value,  false);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<TransformIterator>::value, false);
-    ASSERT_EQUAL((bool) thrust::detail::is_trivial_iterator<ZipIterator>::value,       false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<ConstantIterator>::value,  false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<CountingIterator>::value,  false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<TransformIterator>::value, false);
+    ASSERT_EQUAL((bool) thrust::is_contiguous_iterator<ZipIterator>::value,       false);
 
 }
-DECLARE_UNITTEST(TestIsTrivialIterator);
+DECLARE_UNITTEST(TestIsContiguousIterator);
 
 void TestIsCommutative(void)
 {
diff --git a/testing/uninitialized_copy.cu b/testing/uninitialized_copy.cu
index 83070d2f1..62a79cdc9 100644
--- a/testing/uninitialized_copy.cu
+++ b/testing/uninitialized_copy.cu
@@ -3,6 +3,7 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/iterator/retag.h>
 
+#include <nv/target>
 
 template<typename InputIterator, typename ForwardIterator>
 ForwardIterator uninitialized_copy(my_system &system,
@@ -103,8 +104,6 @@ DECLARE_UNITTEST(TestUninitializedCopyNDispatchImplicit);
 template <class Vector>
 void TestUninitializedCopySimplePOD(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v1(5);
     v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
 
@@ -123,8 +122,6 @@ DECLARE_VECTOR_UNITTEST(TestUninitializedCopySimplePOD);
 template<typename Vector>
 void TestUninitializedCopyNSimplePOD(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v1(5);
     v1[0] = 0; v1[1] = 1; v1[2] = 2; v1[3] = 3; v1[4] = 4;
 
@@ -149,17 +146,18 @@ struct CopyConstructTest
   {}
 
   __host__ __device__
-  CopyConstructTest(const CopyConstructTest &exemplar)
+  CopyConstructTest(const CopyConstructTest &)
   {
-#if __CUDA_ARCH__
-    copy_constructed_on_device = true;
-    copy_constructed_on_host   = false;
-#else
-    copy_constructed_on_device = false;
-    copy_constructed_on_device = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      copy_constructed_on_device = true;
+      copy_constructed_on_host   = false;
+    ), (
+      copy_constructed_on_device = false;
+      copy_constructed_on_host = true;
+    ));
   }
 
+  __host__ __device__
   CopyConstructTest &operator=(const CopyConstructTest &x)
   {
     copy_constructed_on_host   = x.copy_constructed_on_host;
@@ -174,7 +172,7 @@ struct CopyConstructTest
 
 struct TestUninitializedCopyNonPODDevice
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
 
@@ -200,7 +198,7 @@ DECLARE_UNITTEST(TestUninitializedCopyNonPODDevice);
 
 struct TestUninitializedCopyNNonPODDevice
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
 
@@ -226,7 +224,7 @@ DECLARE_UNITTEST(TestUninitializedCopyNNonPODDevice);
 
 struct TestUninitializedCopyNonPODHost
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
 
@@ -252,7 +250,7 @@ DECLARE_UNITTEST(TestUninitializedCopyNonPODHost);
 
 struct TestUninitializedCopyNNonPODHost
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
 
diff --git a/testing/uninitialized_fill.cu b/testing/uninitialized_fill.cu
index 245de657f..8fbb97002 100644
--- a/testing/uninitialized_fill.cu
+++ b/testing/uninitialized_fill.cu
@@ -3,6 +3,7 @@
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/iterator/retag.h>
 
+#include <nv/target>
 
 template<typename ForwardIterator, typename T>
 void uninitialized_fill(my_system &system,
@@ -147,23 +148,25 @@ DECLARE_VECTOR_UNITTEST(TestUninitializedFillPOD);
 
 struct CopyConstructTest
 {
+  __host__ __device__
   CopyConstructTest(void)
     :copy_constructed_on_host(false),
      copy_constructed_on_device(false)
   {}
 
   __host__ __device__
-  CopyConstructTest(const CopyConstructTest &exemplar)
+  CopyConstructTest(const CopyConstructTest &)
   {
-#if __CUDA_ARCH__
-    copy_constructed_on_device = true;
-    copy_constructed_on_host   = false;
-#else
-    copy_constructed_on_device = false;
-    copy_constructed_on_host   = true;
-#endif
+    NV_IF_TARGET(NV_IS_DEVICE, (
+      copy_constructed_on_device = true;
+      copy_constructed_on_host   = false;
+    ), (
+      copy_constructed_on_device = false;
+      copy_constructed_on_host   = true;
+    ));
   }
 
+  __host__ __device__
   CopyConstructTest &operator=(const CopyConstructTest &x)
   {
     copy_constructed_on_host   = x.copy_constructed_on_host;
@@ -178,7 +181,7 @@ struct CopyConstructTest
 
 struct TestUninitializedFillNonPOD
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
     thrust::device_ptr<T> v = thrust::device_malloc<T>(5);
@@ -264,7 +267,7 @@ DECLARE_VECTOR_UNITTEST(TestUninitializedFillNPOD);
 
 struct TestUninitializedFillNNonPOD
 {
-  void operator()(const size_t dummy)
+  void operator()(const size_t)
   {
     typedef CopyConstructTest T;
     thrust::device_ptr<T> v = thrust::device_malloc<T>(5);
diff --git a/testing/unique.cu b/testing/unique.cu
index 793c9b39a..7df2def87 100644
--- a/testing/unique.cu
+++ b/testing/unique.cu
@@ -95,6 +95,50 @@ void TestUniqueCopyDispatchImplicit()
 DECLARE_UNITTEST(TestUniqueCopyDispatchImplicit);
 
 
+template <typename ForwardIterator>
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(my_system &system,
+                 ForwardIterator,
+                 ForwardIterator)
+{
+    system.validate_dispatch();
+    return 0;
+}
+
+void TestUniqueCountDispatchExplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    my_system sys(0);
+    thrust::unique_count(sys, vec.begin(), vec.begin());
+
+    ASSERT_EQUAL(true, sys.is_valid());
+}
+DECLARE_UNITTEST(TestUniqueCountDispatchExplicit);
+
+
+template <typename ForwardIterator>
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(my_tag,
+                 ForwardIterator,
+                 ForwardIterator)
+{
+    return 13;
+}
+
+void TestUniqueCountDispatchImplicit()
+{
+    thrust::device_vector<int> vec(1);
+
+    auto result = thrust::unique_count(
+        thrust::retag<my_tag>(vec.begin()),
+        thrust::retag<my_tag>(vec.begin()));
+
+    ASSERT_EQUAL(13, result);
+}
+DECLARE_UNITTEST(TestUniqueCountDispatchImplicit);
+
+
 template<typename T>
 struct is_equal_div_10_unique
 {
@@ -139,7 +183,7 @@ void TestUniqueSimple(void)
     ASSERT_EQUAL(data[1], 20);
     ASSERT_EQUAL(data[2], 31);
 }
-DECLARE_VECTOR_UNITTEST(TestUniqueSimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueSimple);
 
 
 template<typename T>
@@ -206,7 +250,7 @@ void TestUniqueCopySimple(void)
     ASSERT_EQUAL(data[1], 20);
     ASSERT_EQUAL(data[2], 31);
 }
-DECLARE_VECTOR_UNITTEST(TestUniqueCopySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCopySimple);
 
 
 template<typename T>
@@ -266,3 +310,48 @@ struct TestUniqueCopyToDiscardIterator
 VariableUnitTest<TestUniqueCopyToDiscardIterator, IntegralTypes> TestUniqueCopyToDiscardIteratorInstance;
 
 
+template <typename Vector>
+void TestUniqueCountSimple(void)
+{
+    typedef typename Vector::value_type T;
+
+    Vector data(10);
+    data[0] = 11;
+    data[1] = 11;
+    data[2] = 12;
+    data[3] = 20;
+    data[4] = 29;
+    data[5] = 21;
+    data[6] = 21;
+    data[7] = 31;
+    data[8] = 31;
+    data[9] = 37;
+
+    int count = thrust::unique_count(data.begin(), data.end());
+
+    ASSERT_EQUAL(count, 7);
+
+    int div_10_count = thrust::unique_count(data.begin(), data.end(), is_equal_div_10_unique<T>());
+
+    ASSERT_EQUAL(div_10_count, 3);
+}
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCountSimple);
+
+template <typename T>
+struct TestUniqueCount
+{
+    void operator()(const size_t n)
+    {
+        thrust::host_vector<T> h_data = unittest::random_integers<bool>(n);
+        thrust::device_vector<T> d_data = h_data;
+
+        int h_count{};
+        int d_count{};
+
+        h_count = thrust::unique_count(h_data.begin(), h_data.end());
+        d_count = thrust::unique_count(d_data.begin(), d_data.end());
+
+        ASSERT_EQUAL(h_count, d_count);
+    }
+};
+VariableUnitTest<TestUniqueCount, IntegralTypes> TestUniqueCountInstance;
diff --git a/testing/unique_by_key.cu b/testing/unique_by_key.cu
index 0266c6664..76073e0ca 100644
--- a/testing/unique_by_key.cu
+++ b/testing/unique_by_key.cu
@@ -200,7 +200,7 @@ void TestUniqueByKeySimple(void)
     ASSERT_EQUAL(values[1], 2);
     ASSERT_EQUAL(values[2], 7);
 }
-DECLARE_VECTOR_UNITTEST(TestUniqueByKeySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueByKeySimple);
 
 
 template<typename Vector>
@@ -250,7 +250,7 @@ void TestUniqueCopyByKeySimple(void)
     ASSERT_EQUAL(output_values[1], 2);
     ASSERT_EQUAL(output_values[2], 7);
 }
-DECLARE_VECTOR_UNITTEST(TestUniqueCopyByKeySimple);
+DECLARE_INTEGRAL_VECTOR_UNITTEST(TestUniqueCopyByKeySimple);
 
 
 template<typename K>
diff --git a/testing/unittest/CMakeLists.txt b/testing/unittest/CMakeLists.txt
new file mode 100644
index 000000000..4c0eb66cb
--- /dev/null
+++ b/testing/unittest/CMakeLists.txt
@@ -0,0 +1,24 @@
+foreach(thrust_target IN LISTS THRUST_TARGETS)
+  thrust_get_target_property(config_device ${thrust_target} DEVICE)
+  thrust_get_target_property(config_prefix ${thrust_target} PREFIX)
+
+  set(framework_target ${config_prefix}.test.framework)
+
+  if ("CUDA" STREQUAL "${config_device}")
+    set(framework_srcs
+      testframework.cu
+      cuda/testframework.cu
+    )
+  else()
+    # Wrap the cu file inside a .cpp file for non-CUDA builds
+    thrust_wrap_cu_in_cpp(framework_srcs testframework.cu ${thrust_target})
+  endif()
+
+  add_library(${framework_target} STATIC ${framework_srcs})
+  target_link_libraries(${framework_target} PUBLIC ${thrust_target})
+  target_include_directories(${framework_target} PRIVATE "${Thrust_SOURCE_DIR}/testing")
+  thrust_clone_target_properties(${framework_target} ${thrust_target})
+
+  thrust_fix_clang_nvcc_build_for(${framework_target})
+
+endforeach()
diff --git a/testing/unittest/assertions.h b/testing/unittest/assertions.h
index 0e9f308ca..855d705a4 100644
--- a/testing/unittest/assertions.h
+++ b/testing/unittest/assertions.h
@@ -3,47 +3,111 @@
 #include <thrust/complex.h>
 #include <thrust/host_vector.h>
 #include <thrust/device_vector.h>
+#include <thrust/universal_vector.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 
 #include <unittest/exceptions.h>
 #include <unittest/util.h>
 
-#define ASSERT_EQUAL_QUIET(X,Y)  unittest::assert_equal_quiet((X),(Y), __FILE__, __LINE__)
-#define ASSERT_EQUAL(X,Y)        unittest::assert_equal((X),(Y), __FILE__,  __LINE__)
-#define ASSERT_LEQUAL(X,Y)       unittest::assert_lequal((X),(Y), __FILE__,  __LINE__)
-#define ASSERT_GEQUAL(X,Y)       unittest::assert_gequal((X),(Y), __FILE__,  __LINE__)
-#define ASSERT_ALMOST_EQUAL(X,Y) unittest::assert_almost_equal((X),(Y), __FILE__, __LINE__)
-#define KNOWN_FAILURE            { unittest::UnitTestKnownFailure f; f << "[" << __FILE__ ":" << __LINE__ << "]"; throw f;}
-                    
-#define ASSERT_EQUAL_RANGES(X,Y,Z)  unittest::assert_equal((X),(Y),(Z), __FILE__,  __LINE__)
-
-#define ASSERT_THROWS(X,Y)                                                         \
-    {   bool thrown = false; try { X; } catch (Y) { thrown = true; }                  \
-        if (!thrown) { unittest::UnitTestFailure f; f << "[" << __FILE__ << ":" << __LINE__ << "] did not throw " << #Y; throw f; } \
-    }
-
+#define ASSERT_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)           unittest::assert_equal((X),(Y), FILE_,  LINE_)
+#define ASSERT_EQUAL_QUIET_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)     unittest::assert_equal_quiet((X),(Y), FILE_, LINE_)
+#define ASSERT_NOT_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)       unittest::assert_not_equal((X),(Y), FILE_,  LINE_)
+#define ASSERT_NOT_EQUAL_QUIET_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_) unittest::assert_not_equal_quiet((X),(Y), FILE_, LINE_)
+#define ASSERT_LEQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)          unittest::assert_lequal((X),(Y), FILE_,  LINE_)
+#define ASSERT_GEQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)          unittest::assert_gequal((X),(Y), FILE_,  LINE_)
+#define ASSERT_LESS_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)            unittest::assert_less((X),(Y), FILE_,  LINE_)
+#define ASSERT_GREATER_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)         unittest::assert_greater((X),(Y), FILE_,  LINE_)
+#define ASSERT_ALMOST_EQUAL_WITH_FILE_AND_LINE(X,Y,FILE_,LINE_)    unittest::assert_almost_equal((X),(Y), FILE_, LINE_)
+#define ASSERT_EQUAL_RANGES_WITH_FILE_AND_LINE(X,Y,Z,FILE_,LINE_)  unittest::assert_equal((X),(Y),(Z), FILE_,  LINE_)
+
+#define ASSERT_THROWS_WITH_FILE_AND_LINE(                                     \
+  EXPR, EXCEPTION_TYPE, FILE_, LINE_                                          \
+)                                                                             \
+  {                                                                           \
+    unittest::threw_status THRUST_PP_CAT2(__s, LINE_)                         \
+      = unittest::did_not_throw;                                              \
+    try { EXPR; }                                                             \
+    catch (EXCEPTION_TYPE const&)                                             \
+    { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_right_type; }              \
+    catch (...)                                                               \
+    { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_wrong_type; }              \
+    unittest::check_assert_throws(                                            \
+      THRUST_PP_CAT2(__s, LINE_), THRUST_PP_STRINGIZE(EXCEPTION_TYPE)         \
+    , FILE_, LINE_                                                            \
+    );                                                                        \
+  }                                                                           \
+  /**/
+
+#define ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(                               \
+  EXPR, EXCEPTION_TYPE, VALUE, FILE_, LINE_                                   \
+)                                                                             \
+  {                                                                           \
+    unittest::threw_status THRUST_PP_CAT2(__s, LINE_)                         \
+      = unittest::did_not_throw;                                              \
+    try { EXPR; }                                                             \
+    catch (EXCEPTION_TYPE const& THRUST_PP_CAT2(__e, LINE_))                  \
+    {                                                                         \
+      if (VALUE == THRUST_PP_CAT2(__e, LINE_))                                \
+        THRUST_PP_CAT2(__s, LINE_)                                            \
+          = unittest::threw_right_type;                                       \
+      else                                                                    \
+        THRUST_PP_CAT2(__s, LINE_)                                            \
+          = unittest::threw_right_type_but_wrong_value;                       \
+    }                                                                         \
+    catch (...) { THRUST_PP_CAT2(__s, LINE_) = unittest::threw_wrong_type; }  \
+    unittest::check_assert_throws(                                            \
+      THRUST_PP_CAT2(__s, LINE_), THRUST_PP_STRINGIZE(EXCEPTION_TYPE)         \
+    , FILE_, LINE_                                                            \
+    );                                                                        \
+  }                                                                           \
+  /**/
+
+#define KNOWN_FAILURE_WITH_FILE_AND_LINE(FILE_, LINE_)                                  \
+  { unittest::UnitTestKnownFailure f; f << "[" << FILE_ ":" << LINE_ << "]"; throw f; } \
+  /**/
+
+#define ASSERT_EQUAL(X,Y)           ASSERT_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_EQUAL_QUIET(X,Y)     ASSERT_EQUAL_QUIET_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_NOT_EQUAL(X,Y)       ASSERT_NOT_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_NOT_EQUAL_QUIET(X,Y) ASSERT_NOT_EQUAL_QUIET_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_LEQUAL(X,Y)          ASSERT_LEQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_GEQUAL(X,Y)          ASSERT_GEQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_LESS(X,Y)            ASSERT_LESS_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_GREATER(X,Y)         ASSERT_GREATER_WITH_FILE_AND_LINE((X),(Y), __FILE__,  __LINE__)
+#define ASSERT_ALMOST_EQUAL(X,Y)    ASSERT_ALMOST_EQUAL_WITH_FILE_AND_LINE((X),(Y), __FILE__, __LINE__)
+#define ASSERT_EQUAL_RANGES(X,Y,Z)  ASSERT_EQUAL_WITH_FILE_AND_LINE((X),(Y),(Z), __FILE__,  __LINE__)
+
+#define ASSERT_THROWS(EXPR, EXCEPTION_TYPE)                                   \
+  ASSERT_THROWS_WITH_FILE_AND_LINE(EXPR, EXCEPTION_TYPE, __FILE__, __LINE__)  \
+  /**/
+
+#define ASSERT_THROWS_EQUAL(EXPR, EXCEPTION_TYPE, VALUE)                                  \
+  ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(EXPR, EXCEPTION_TYPE, VALUE, __FILE__, __LINE__) \
+  /**/
+
+#define KNOWN_FAILURE KNOWN_FAILURE_WITH_FILE_AND_LINE(__FILE__, __LINE__)
 
 namespace unittest
 {
 
-static size_t MAX_OUTPUT_LINES = 10;
+size_t const MAX_OUTPUT_LINES = 10;
 
-static double DEFAULT_RELATIVE_TOL = 1e-4;
-static double DEFAULT_ABSOLUTE_TOL = 1e-4;
+double const DEFAULT_RELATIVE_TOL = 1e-4;
+double const DEFAULT_ABSOLUTE_TOL = 1e-4;
 
 template<typename T>
   struct value_type
 {
-  typedef typename thrust::detail::remove_const<
-    typename thrust::detail::remove_reference<
+  typedef typename THRUST_NS_QUALIFIER::detail::remove_const<
+    typename THRUST_NS_QUALIFIER::detail::remove_reference<
       T
     >::type
   >::type type;
 };
 
 template<typename T>
-  struct value_type< thrust::device_reference<T> >
+  struct value_type< THRUST_NS_QUALIFIER::device_reference<T> >
 {
   typedef typename value_type<T>::type type;
 };
@@ -51,13 +115,10 @@ template<typename T>
 ////
 // check scalar values
 template <typename T1, typename T2>
-void assert_equal(const T1& a, const T2& b, 
+void assert_equal(T1 a, T2 b,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    // convert a & b to a's value_type to avoid warning upon comparison
-    typedef typename value_type<T1>::type T;
-
-    if(!(T(a) == T(b))){
+    if(!(a == b)){
         unittest::UnitTestFailure f;
         f << "[" << filename << ":" << lineno << "] ";
         f << "values are not equal: " << a << " " << b;
@@ -66,22 +127,125 @@ void assert_equal(const T1& a, const T2& b,
     }
 }
 
-// sometimes it's not possible to << a type
+void assert_equal(char a, char b,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a == b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are not equal: " << int(a) << " " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+// sometimes its not possible to << a type
 template <typename T1, typename T2>
-void assert_equal_quiet(const T1& a, const T2& b, 
+void assert_equal_quiet(const T1& a, const T2& b,
                         const std::string& filename = "unknown", int lineno = -1)
 {
     if(!(a == b)){
         unittest::UnitTestFailure f;
         f << "[" << filename << ":" << lineno << "] ";
-        f << "values are not equal.";
+        f << "values are not equal";
         f << " [type='" << type_name<T1>() << "']";
         throw f;
     }
 }
 
+////
+// check scalar values
 template <typename T1, typename T2>
-void assert_lequal(const T1& a, const T2& b, 
+void assert_not_equal(T1 a, T2 b,
+                      const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal: " << a << " " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_not_equal(char a, char b,
+                      const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal: " << int(a) << " " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+// sometimes its not possible to << a type
+template <typename T1, typename T2>
+void assert_not_equal_quiet(const T1& a, const T2& b,
+                            const std::string& filename = "unknown", int lineno = -1)
+{
+    if(a == b){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << "values are equal";
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_less(T1 a, T2 b,
+                 const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a < b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is greater or equal to " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_less(char a, char b,
+                 const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a < b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is greater than or equal to " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_greater(T1 a, T2 b,
+                    const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a > b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << a << " is less than or equal to " << b;
+        f << " [type='" << type_name<T1>() << "']";
+        throw f;
+    }
+}
+
+void assert_greater(char a, char b,
+                    const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a > b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is less than or equal to " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
+template <typename T1, typename T2>
+void assert_lequal(T1 a, T2 b,
                    const std::string& filename = "unknown", int lineno = -1)
 {
     if(!(a <= b)){
@@ -93,11 +257,23 @@ void assert_lequal(const T1& a, const T2& b,
     }
 }
 
+void assert_lequal(char a, char b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a <= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is greater than " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
 template <typename T1, typename T2>
-void assert_gequal(const T1& a, const T2& b, 
+void assert_gequal(T1 a, T2 b,
                    const std::string& filename = "unknown", int lineno = -1)
 {
-    if(!(a >= T1(b))){
+    if(!(a >= b)){
         unittest::UnitTestFailure f;
         f << "[" << filename << ":" << lineno << "] ";
         f << a << " is less than " << b;
@@ -106,6 +282,18 @@ void assert_gequal(const T1& a, const T2& b,
     }
 }
 
+void assert_gequal(char a, char b,
+                   const std::string& filename = "unknown", int lineno = -1)
+{
+    if(!(a >= b)){
+        unittest::UnitTestFailure f;
+        f << "[" << filename << ":" << lineno << "] ";
+        f << int(a) << " is less than " << int(b);
+        f << " [type='" << type_name<char>() << "']";
+        throw f;
+    }
+}
+
 // define our own abs() because std::abs() isn't portable for all types for some reason
 template<typename T>
   T abs(const T &x)
@@ -124,7 +312,7 @@ bool almost_equal(const double& a, const double& b, const double& a_tol, const d
 }
 
 template <typename T1, typename T2>
-void assert_almost_equal(const T1& a, const T2& b, 
+void assert_almost_equal(T1 a, T2 b,
                          const std::string& filename = "unknown", int lineno = -1,
                          double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
 
@@ -140,7 +328,7 @@ void assert_almost_equal(const T1& a, const T2& b,
 
 
 template <typename T1, typename T2>
-  void assert_almost_equal(const thrust::complex<T1>& a, const thrust::complex<T2>& b, 
+void assert_almost_equal(THRUST_NS_QUALIFIER::complex<T1> a, THRUST_NS_QUALIFIER::complex<T2> b,
                          const std::string& filename = "unknown", int lineno = -1,
                          double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
 
@@ -156,7 +344,7 @@ template <typename T1, typename T2>
 
 
 template <typename T1, typename T2>
-  void assert_almost_equal(const thrust::complex<T1>& a, const std::complex<T2>& b, 
+  void assert_almost_equal(const THRUST_NS_QUALIFIER::complex<T1>& a, const std::complex<T2>& b,
                          const std::string& filename = "unknown", int lineno = -1,
                          double a_tol = DEFAULT_ABSOLUTE_TOL, double r_tol = DEFAULT_RELATIVE_TOL)
 
@@ -183,14 +371,14 @@ class almost_equal_to
 
 
 template <typename T>
-class almost_equal_to<thrust::complex<T> >
+class almost_equal_to<THRUST_NS_QUALIFIER::complex<T> >
 {
     public:
         double a_tol, r_tol;
         almost_equal_to(double _a_tol = DEFAULT_ABSOLUTE_TOL, double _r_tol = DEFAULT_RELATIVE_TOL) : a_tol(_a_tol), r_tol(_r_tol) {}
-        bool operator()(const thrust::complex<T>& a, const thrust::complex<T>& b) const {
-	  return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol) && 
-	    almost_equal((double) a.imag(), (double) b.imag(), a_tol, r_tol);
+        bool operator()(const THRUST_NS_QUALIFIER::complex<T>& a, const THRUST_NS_QUALIFIER::complex<T>& b) const {
+            return almost_equal((double) a.real(), (double) b.real(), a_tol, r_tol)
+                && almost_equal((double) a.imag(), (double) b.imag(), a_tol, r_tol);
         }
 };
 
@@ -201,15 +389,15 @@ template <typename ForwardIterator1, typename ForwardIterator2, typename BinaryP
 void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2, BinaryPredicate op,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    typedef typename thrust::iterator_difference<ForwardIterator1>::type difference_type;
-    typedef typename thrust::iterator_value<ForwardIterator1>::type InputType;
-    
+    typedef typename THRUST_NS_QUALIFIER::iterator_difference<ForwardIterator1>::type difference_type;
+    typedef typename THRUST_NS_QUALIFIER::iterator_value<ForwardIterator1>::type InputType;
+
     bool failure = false;
 
-    difference_type length1 = thrust::distance(first1, last1);
-    difference_type length2 = thrust::distance(first2, last2);
-    
-    difference_type min_length = thrust::min(length1, length2);
+    difference_type length1 = THRUST_NS_QUALIFIER::distance(first1, last1);
+    difference_type length2 = THRUST_NS_QUALIFIER::distance(first2, last2);
+
+    difference_type min_length = THRUST_NS_QUALIFIER::min(length1, length2);
 
     unittest::UnitTestFailure f;
     f << "[" << filename << ":" << lineno << "] ";
@@ -222,7 +410,7 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat
     }
 
     // check values
-    
+
     size_t mismatches = 0;
 
     for (difference_type i = 0; i < min_length; i++)
@@ -240,10 +428,14 @@ void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterat
 
         if(mismatches <= MAX_OUTPUT_LINES)
         {
-          if (sizeof(InputType) == 1)
+          THRUST_IF_CONSTEXPR(sizeof(InputType) == 1)
+          {
             f << "  [" << i << "] " << *first1 + InputType() << "  " << *first2 + InputType() << "\n"; // unprintable chars are a problem
+          }
           else
+          {
             f << "  [" << i << "] " << *first1 << "  " << *first2 << "\n";
+          }
         }
       }
 
@@ -271,8 +463,8 @@ template <typename ForwardIterator1, typename ForwardIterator2>
 void assert_equal(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, ForwardIterator2 last2,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
-    assert_equal(first1, last1, first2, last2, thrust::equal_to<InputType>(), filename, lineno);
+    typedef typename THRUST_NS_QUALIFIER::iterator_traits<ForwardIterator1>::value_type InputType;
+    assert_equal(first1, last1, first2, last2, THRUST_NS_QUALIFIER::equal_to<InputType>(), filename, lineno);
 }
 
 
@@ -281,77 +473,237 @@ void assert_almost_equal(ForwardIterator1 first1, ForwardIterator1 last1, Forwar
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType;
+    typedef typename THRUST_NS_QUALIFIER::iterator_traits<ForwardIterator1>::value_type InputType;
     assert_equal(first1, last1, first2, last2, almost_equal_to<InputType>(a_tol, r_tol), filename, lineno);
 }
 
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
+}
 
-template <typename T, typename Alloc>
-void assert_equal(const thrust::host_vector<T,Alloc>& A, const thrust::host_vector<T,Alloc>& B,
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_equal(A, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc2> A_host = A;
+    assert_equal(A_host, B, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    THRUST_NS_QUALIFIER::host_vector<T> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T> B_host = B;
+    assert_equal(A_host, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
     assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
-template <typename T, typename Alloc>
-void assert_almost_equal(const thrust::host_vector<T,Alloc>& A, const thrust::host_vector<T,Alloc>& B, 
-                         const std::string& filename = "unknown", int lineno = -1,
-                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
 {
-    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T,Alloc1> B_host = B;
-    assert_equal(A, B_host, filename, lineno);
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+void assert_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T,Alloc2> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> A_host = A;
     assert_equal(A_host, B, filename, lineno);
 }
 
-template <typename T, typename Alloc>
-void assert_equal(const thrust::device_vector<T,Alloc>& A, const thrust::device_vector<T,Alloc>& B,
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                  const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
                   const std::string& filename = "unknown", int lineno = -1)
 {
-    thrust::host_vector<T> A_host = A;
-    thrust::host_vector<T> B_host = B;
-    assert_equal(A_host, B_host, filename, lineno);
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_equal(A, B_host, filename, lineno);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_equal(const std::vector<T,Alloc1>& A, const std::vector<T,Alloc2>& B,
+                  const std::string& filename = "unknown", int lineno = -1)
+{
+    assert_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::host_vector<T,Alloc1>& A, const thrust::device_vector<T,Alloc2>& B,
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T,Alloc1> B_host = B;
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
     assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol);
 }
 
 template <typename T, typename Alloc1, typename Alloc2>
-void assert_almost_equal(const thrust::device_vector<T,Alloc1>& A, const thrust::host_vector<T,Alloc2>& B,
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T,Alloc2> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc2> A_host = A;
     assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol);
 }
 
-template <typename T, typename Alloc>
-void assert_almost_equal(const thrust::device_vector<T,Alloc>& A, const thrust::device_vector<T,Alloc>& B,
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
                          const std::string& filename = "unknown", int lineno = -1,
                          const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
 {
-    thrust::host_vector<T> A_host = A;
-    thrust::host_vector<T> B_host = B;
+    THRUST_NS_QUALIFIER::host_vector<T> A_host = A;
+    THRUST_NS_QUALIFIER::host_vector<T> B_host = B;
     assert_almost_equal(A_host, B_host, filename, lineno, a_tol, r_tol);
 }
 
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::host_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::host_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::device_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::universal_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> A_host = A;
+    assert_almost_equal(A_host, B, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const THRUST_NS_QUALIFIER::universal_vector<T,Alloc1>& A,
+                         const THRUST_NS_QUALIFIER::device_vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    THRUST_NS_QUALIFIER::host_vector<T,Alloc1> B_host = B;
+    assert_almost_equal(A, B_host, filename, lineno, a_tol, r_tol);
+}
+
+template <typename T, typename Alloc1, typename Alloc2>
+void assert_almost_equal(const std::vector<T,Alloc1>& A, const std::vector<T,Alloc2>& B,
+                         const std::string& filename = "unknown", int lineno = -1,
+                         const double a_tol = DEFAULT_ABSOLUTE_TOL, const double r_tol = DEFAULT_RELATIVE_TOL)
+{
+    assert_almost_equal(A.begin(), A.end(), B.begin(), B.end(), filename, lineno, a_tol, r_tol);
+}
+
+enum threw_status
+{
+  did_not_throw
+, threw_wrong_type
+, threw_right_type_but_wrong_value
+, threw_right_type
+};
+
+void check_assert_throws(
+  threw_status s
+, std::string const& exception_name
+, std::string const& file_name = "unknown"
+, int line_number = -1
+)
+{
+  switch (s)
+  {
+    case did_not_throw:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] did not throw anything";
+      throw f;
+    }
+    case threw_wrong_type:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] did not throw an "
+        << "object of type " << exception_name;
+      throw f;
+    }
+    case threw_right_type_but_wrong_value:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] threw an object of the "
+        << "correct type (" << exception_name << ") but wrong value";
+      throw f;
+    }
+    case threw_right_type:
+      break;
+    default:
+    {
+      unittest::UnitTestFailure f;
+      f << "[" << file_name << ":" << line_number << "] encountered an "
+        << "unknown error";
+      throw f;
+    }
+  }
+}
+
 }; //end namespace unittest
diff --git a/testing/backend/cuda/testframework.cu b/testing/unittest/cuda/testframework.cu
similarity index 75%
rename from testing/backend/cuda/testframework.cu
rename to testing/unittest/cuda/testframework.cu
index 6fb52f9b2..ff30f368c 100644
--- a/testing/backend/cuda/testframework.cu
+++ b/testing/unittest/cuda/testframework.cu
@@ -1,7 +1,8 @@
 #include <unittest/testframework.h>
+#include <unittest/cuda/testframework.h>
 #include <thrust/system/cuda/memory.h>
 #include <cuda_runtime.h>
-#include "testframework.h"
+#include <numeric>
 
 __global__ void dummy_kernel() {}
 
@@ -12,7 +13,12 @@ bool binary_exists_for_current_device()
   // we didn't compile a binary compatible with the current device
   cudaFuncAttributes attr;
   cudaError_t error = cudaFuncGetAttributes(&attr, dummy_kernel);
-  return error == cudaSuccess;
+
+  // clear the CUDA global error state if we just set it, so that
+  // check_cuda_error doesn't complain
+  if (cudaSuccess != error) (void)cudaGetLastError();
+
+  return cudaSuccess == error;
 }
 
 void list_devices(void)
@@ -23,15 +29,15 @@ void list_devices(void)
   {
     std::cout << "There is no device supporting CUDA" << std::endl;
   }
-  
+
   int selected_device;
   cudaGetDevice(&selected_device);
-  
+
   for (int dev = 0; dev < deviceCount; ++dev)
   {
     cudaDeviceProp deviceProp;
     cudaGetDeviceProperties(&deviceProp, dev);
-    
+
     if(dev == 0)
     {
       if(deviceProp.major == 9999 && deviceProp.minor == 9999)
@@ -41,12 +47,12 @@ void list_devices(void)
       else
         std::cout << "There are " << deviceCount <<  " devices supporting CUDA" << std:: endl;
     }
-    
+
     std::cout << "\nDevice " << dev << ": \"" << deviceProp.name << "\"";
     if(dev == selected_device)
       std::cout << "  [SELECTED]";
     std::cout << std::endl;
-    
+
     std::cout << "  Major revision number:                         " << deviceProp.major << std::endl;
     std::cout << "  Minor revision number:                         " << deviceProp.minor << std::endl;
     std::cout << "  Total amount of global memory:                 " << deviceProp.totalGlobalMem << " bytes" << std::endl;
@@ -64,62 +70,64 @@ template<typename Iterator> Iterator my_next(Iterator iter)
 std::vector<int> CUDATestDriver::target_devices(const ArgumentMap &kwargs)
 {
   std::vector<int> result;
-  
+
   // by default, test all devices in the system (device id -1)
   int device_id = kwargs.count("device") ? atoi(kwargs.find("device")->second.c_str()) : -1;
-  
+
   if(device_id < 0)
   {
     // target all devices in the system
     int count = 0;
     cudaGetDeviceCount(&count);
-    
+
     result.resize(count);
-    // XXX iota is not available in c++03
-    for(int i = 0; i < count; ++i)
-      result[i] = i;
+    std::iota(result.begin(), result.end(), 0);
   }
   else
   {
     // target the specified device
     result = std::vector<int>(1,device_id);
   }
-  
+
   return result;
 }
 
 bool CUDATestDriver::check_cuda_error(bool concise)
 {
-  cudaError_t error = cudaGetLastError();
-  if(error)
+  cudaError_t const error = cudaGetLastError();
+  if(cudaSuccess != error)
   {
     if(!concise)
     {
-      std::cout << "[ERROR] CUDA Error detected before running tests: [";
-      std::cout << std::string(cudaGetErrorString(error));
-      std::cout << "]" << std::endl;
+      std::cout << "[ERROR] CUDA error detected before running tests: ["
+                << std::string(cudaGetErrorName(error))
+                << ": "
+                << std::string(cudaGetErrorString(error))
+                << "]" << std::endl;
     }
-  } 
+  }
 
-  return error;
+  return cudaSuccess != error;
 }
 
-bool CUDATestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
+bool CUDATestDriver::post_test_smoke_check(const UnitTest &test, bool concise)
 {
-  cudaError_t error = cudaGetLastError();
-  if(error && error != cudaErrorMemoryAllocation)
+  cudaError_t const error = cudaDeviceSynchronize();
+  if(cudaSuccess != error)
   {
     if(!concise)
     {
-      std::cout << "\t[ERROR] CUDA Error detected after running " << test.name << ": [";
-      std::cout << std::string(cudaGetErrorString(error));
-      std::cout << "]" << std::endl;
+      std::cout << "\t[ERROR] CUDA error detected after running " << test.name << ": ["
+                << std::string(cudaGetErrorName(error))
+                << ": "
+                << std::string(cudaGetErrorString(error))
+                << "]" << std::endl;
     }
   }
 
-  return error == cudaSuccess;
+  return cudaSuccess == error;
 }
-  
+
 bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwargs)
 {
   bool verbose = kwargs.count("verbose");
@@ -133,22 +141,24 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
 
   // check error status before doing anything
   if(check_cuda_error(concise)) return false;
-  
+
   bool result = true;
 
   if(kwargs.count("verbose"))
   {
     list_devices();
   }
-  
+
   // figure out which devices to target
   std::vector<int> devices = target_devices(kwargs);
-  
+
   // target each device
   for(std::vector<int>::iterator device = devices.begin();
       device != devices.end();
       ++device)
   {
+    cudaDeviceSynchronize();
+
     // set the device
     cudaSetDevice(*device);
 
@@ -156,7 +166,13 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
     // if none exists, skip the device silently unless this is the only one we're targeting
     if(devices.size() > 1 && !binary_exists_for_current_device())
     {
-      continue;     
+      // note which device we're skipping
+      cudaDeviceProp deviceProp;
+      cudaGetDeviceProperties(&deviceProp, *device);
+
+      std::cout << "Skipping Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
+
+      continue;
     }
 
     if(!concise)
@@ -164,23 +180,23 @@ bool CUDATestDriver::run_tests(const ArgumentSet &args, const ArgumentMap &kwarg
       // note which device we're testing
       cudaDeviceProp deviceProp;
       cudaGetDeviceProperties(&deviceProp, *device);
-      
+
       std::cout << "Testing Device " << *device << ": \"" << deviceProp.name << "\"" << std::endl;
     }
 
     // check error status before running any tests
     if(check_cuda_error(concise)) return false;
-    
+
     // run tests
     result &= UnitTestDriver::run_tests(args, kwargs);
-    
+
     if(!concise && my_next(device) != devices.end())
     {
       // provide some separation between the output of separate tests
       std::cout << std::endl;
     }
   }
-  
+
   return result;
 }
 
diff --git a/testing/backend/cuda/testframework.h b/testing/unittest/cuda/testframework.h
similarity index 86%
rename from testing/backend/cuda/testframework.h
rename to testing/unittest/cuda/testframework.h
index 953f88c1c..34a3dce5a 100644
--- a/testing/backend/cuda/testframework.h
+++ b/testing/unittest/cuda/testframework.h
@@ -16,7 +16,7 @@ class CUDATestDriver
 
     bool check_cuda_error(bool concise);
 
-    virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+    virtual bool post_test_smoke_check(const UnitTest &test, bool concise);
 
     virtual bool run_tests(const ArgumentSet &args, const ArgumentMap &kwargs);
 };
diff --git a/testing/unittest/meta.h b/testing/unittest/meta.h
index 9a2b6d8a8..ed492634b 100644
--- a/testing/unittest/meta.h
+++ b/testing/unittest/meta.h
@@ -13,49 +13,10 @@ namespace unittest
 struct null_type {}; 
 
 // this type encapsulates a list of
-// up to 10 types
-template<typename T0 = null_type,
-         typename T1 = null_type,
-         typename T2 = null_type,
-         typename T3 = null_type,
-         typename T4 = null_type,
-         typename T5 = null_type,
-         typename T6 = null_type,
-         typename T7 = null_type,
-         typename T8 = null_type,
-         typename T9 = null_type,
-         typename T10 = null_type,
-         typename T11 = null_type,
-         typename T12 = null_type,
-         typename T13 = null_type,
-         typename T14 = null_type,
-         typename T15 = null_type,
-         typename T16 = null_type,
-         typename T17 = null_type,
-         typename T18 = null_type,
-         typename T19 = null_type>
+// types
+template<typename... Ts>
   struct type_list
 {
-  typedef T0 type_0;
-  typedef T1 type_1;
-  typedef T2 type_2;
-  typedef T3 type_3;
-  typedef T4 type_4;
-  typedef T5 type_5;
-  typedef T6 type_6;
-  typedef T7 type_7;
-  typedef T8 type_8;
-  typedef T9 type_9;
-  typedef T10 type_10;
-  typedef T11 type_11;
-  typedef T12 type_12;
-  typedef T13 type_13;
-  typedef T14 type_14;
-  typedef T15 type_15;
-  typedef T16 type_16;
-  typedef T17 type_17;
-  typedef T18 type_18;
-  typedef T19 type_19;
 };
 
 // this type provides a way of indexing
@@ -66,26 +27,17 @@ template<typename List, unsigned int i>
   typedef null_type type;
 };
 
-template<typename List>  struct get_type<List,0> { typedef typename List::type_0 type; };
-template<typename List>  struct get_type<List,1> { typedef typename List::type_1 type; };
-template<typename List>  struct get_type<List,2> { typedef typename List::type_2 type; };
-template<typename List>  struct get_type<List,3> { typedef typename List::type_3 type; };
-template<typename List>  struct get_type<List,4> { typedef typename List::type_4 type; };
-template<typename List>  struct get_type<List,5> { typedef typename List::type_5 type; };
-template<typename List>  struct get_type<List,6> { typedef typename List::type_6 type; };
-template<typename List>  struct get_type<List,7> { typedef typename List::type_7 type; };
-template<typename List>  struct get_type<List,8> { typedef typename List::type_8 type; };
-template<typename List>  struct get_type<List,9> { typedef typename List::type_9 type; };
-template<typename List>  struct get_type<List,10> { typedef typename List::type_10 type; };
-template<typename List>  struct get_type<List,11> { typedef typename List::type_11 type; };
-template<typename List>  struct get_type<List,12> { typedef typename List::type_12 type; };
-template<typename List>  struct get_type<List,13> { typedef typename List::type_13 type; };
-template<typename List>  struct get_type<List,14> { typedef typename List::type_14 type; };
-template<typename List>  struct get_type<List,15> { typedef typename List::type_15 type; };
-template<typename List>  struct get_type<List,16> { typedef typename List::type_16 type; };
-template<typename List>  struct get_type<List,17> { typedef typename List::type_17 type; };
-template<typename List>  struct get_type<List,18> { typedef typename List::type_18 type; };
-template<typename List>  struct get_type<List,19> { typedef typename List::type_19 type; };
+template<typename T, typename... Ts>
+  struct get_type<type_list<T, Ts...>, 0>
+{
+  typedef T type;
+};
+
+template<typename T, typename... Ts, unsigned int i>
+  struct get_type<type_list<T, Ts...>, i>
+{
+  typedef typename get_type<type_list<Ts...>, i - 1>::type type;
+};
 
 // this type and its specialization provides a way to
 // iterate over a type_list, and
@@ -133,7 +85,7 @@ template<typename TypeList,
   struct for_each_type<TypeList, Function, null_type, i>
 {
   template<typename U>
-    void operator()(U n)
+    void operator()(U)
   {
     // no-op
   }
@@ -196,64 +148,26 @@ template<template <typename,typename> class Template>
 // the Type_list's types
 template<typename TypeList,
          template <typename> class Template>
-  struct transform1
-{
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,0>::type>::type type_0;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,1>::type>::type type_1;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,2>::type>::type type_2;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,3>::type>::type type_3;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,4>::type>::type type_4;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,5>::type>::type type_5;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,6>::type>::type type_6;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,7>::type>::type type_7;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,8>::type>::type type_8;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,9>::type>::type type_9;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,10>::type>::type type_10;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,11>::type>::type type_11;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,12>::type>::type type_12;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,13>::type>::type type_13;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,14>::type>::type type_14;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,15>::type>::type type_15;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,16>::type>::type type_16;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,17>::type>::type type_17;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,18>::type>::type type_18;
-  typedef typename ApplyTemplate1<Template, typename get_type<TypeList,19>::type>::type type_19;
+  struct transform1;
 
-  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
-                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+template<typename... Ts,
+         template <typename> class Template>
+  struct transform1<type_list<Ts...>, Template>
+{
+  typedef type_list<typename ApplyTemplate1<Template, Ts>::type...> type;
 };
 
-// this type creates a new type_list by applying a Template to each of
-// two type_list's types
 template<typename TypeList1,
          typename TypeList2,
          template <typename,typename> class Template>
-  struct transform2
-{
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,0>::type, typename get_type<TypeList2,0>::type>::type type_0;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,1>::type, typename get_type<TypeList2,1>::type>::type type_1;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,2>::type, typename get_type<TypeList2,2>::type>::type type_2;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,3>::type, typename get_type<TypeList2,3>::type>::type type_3;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,4>::type, typename get_type<TypeList2,4>::type>::type type_4;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,5>::type, typename get_type<TypeList2,5>::type>::type type_5;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,6>::type, typename get_type<TypeList2,6>::type>::type type_6;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,7>::type, typename get_type<TypeList2,7>::type>::type type_7;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,8>::type, typename get_type<TypeList2,8>::type>::type type_8;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,9>::type, typename get_type<TypeList2,9>::type>::type type_9;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,10>::type, typename get_type<TypeList2,10>::type>::type type_10;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,11>::type, typename get_type<TypeList2,11>::type>::type type_11;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,12>::type, typename get_type<TypeList2,12>::type>::type type_12;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,13>::type, typename get_type<TypeList2,13>::type>::type type_13;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,14>::type, typename get_type<TypeList2,14>::type>::type type_14;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,15>::type, typename get_type<TypeList2,15>::type>::type type_15;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,16>::type, typename get_type<TypeList2,16>::type>::type type_16;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,17>::type, typename get_type<TypeList2,17>::type>::type type_17;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,18>::type, typename get_type<TypeList2,18>::type>::type type_18;
-  typedef typename ApplyTemplate2<Template, typename get_type<TypeList1,19>::type, typename get_type<TypeList2,19>::type>::type type_19;
-  
+  struct transform2;
 
-  typedef type_list<type_0, type_1, type_2, type_3, type_4, type_5, type_6, type_7, type_8, type_9,
-                    type_10, type_11, type_12, type_13, type_14, type_15, type_16, type_17, type_18, type_19> type;
+template<typename... T1s,
+         typename... T2s,
+         template <typename,typename> class Template>
+  struct transform2<type_list<T1s...>, type_list<T2s...>, Template>
+{
+  typedef type_list<typename ApplyTemplate2<Template, T1s, T2s>::type...> type;
 };
 
 } // end unittest
diff --git a/testing/unittest/random.h b/testing/unittest/random.h
index a46b8e5b3..c94c3fecb 100644
--- a/testing/unittest/random.h
+++ b/testing/unittest/random.h
@@ -4,6 +4,8 @@
 #include <thrust/random.h>
 #include <thrust/detail/type_traits.h>
 
+#include <limits>
+
 namespace unittest
 {
 
@@ -18,36 +20,66 @@ inline unsigned int hash(unsigned int a)
     return a;
 }
 
-template<typename T, bool is_float = thrust::detail::is_floating_point<T>::value>
-  struct random_integer
+template<typename T, typename = void>
+  struct generate_random_integer;
+
+template<typename T>
+  struct generate_random_integer<T,
+    typename THRUST_NS_QUALIFIER::detail::disable_if<
+      THRUST_NS_QUALIFIER::detail::is_non_bool_arithmetic<T>::value
+    >::type
+  >
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<T> dist;
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+
+      return static_cast<T>(rng());
+  }
+};
+
+template<typename T>
+  struct generate_random_integer<T,
+    typename THRUST_NS_QUALIFIER::detail::enable_if<
+      THRUST_NS_QUALIFIER::detail::is_non_bool_integral<T>::value
+    >::type
+  >
+{
+  T operator()(unsigned int i) const
+  {
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<T> dist;
 
       return static_cast<T>(dist(rng));
   }
 };
 
 template<typename T>
-  struct random_integer<T,true>
+  struct generate_random_integer<T,
+    typename THRUST_NS_QUALIFIER::detail::enable_if<
+      THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
+    >::type
+  >
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
+      T const min = std::numeric_limits<T>::min();
+      T const max = std::numeric_limits<T>::max();
 
-      return static_cast<T>(rng());
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_real_distribution<T> dist(min, max);
+
+      return static_cast<T>(dist(rng));
   }
 };
 
 template<>
-  struct random_integer<bool,false>
+  struct generate_random_integer<bool>
 {
   bool operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<unsigned int> dist(0,1);
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<unsigned int> dist(0,1);
 
       return dist(rng) == 1;
   }
@@ -55,12 +87,12 @@ template<>
 
 
 template<typename T>
-  struct random_sample
+  struct generate_random_sample
 {
   T operator()(unsigned int i) const
   {
-      thrust::default_random_engine rng(hash(i));
-      thrust::uniform_int_distribution<unsigned int> dist(0,20);
+      THRUST_NS_QUALIFIER::default_random_engine rng(hash(i));
+      THRUST_NS_QUALIFIER::uniform_int_distribution<unsigned int> dist(0,20);
 
       return static_cast<T>(dist(rng));
   } 
@@ -69,25 +101,31 @@ template<typename T>
 
 
 template<typename T>
-thrust::host_vector<T> random_integers(const size_t N)
+THRUST_NS_QUALIFIER::host_vector<T> random_integers(const size_t N)
 {
-    thrust::host_vector<T> vec(N);
-    thrust::transform(thrust::counting_iterator<size_t>(0),
-                      thrust::counting_iterator<size_t>(N),
-                      vec.begin(),
-                      random_integer<T>());
+    THRUST_NS_QUALIFIER::host_vector<T> vec(N);
+    THRUST_NS_QUALIFIER::transform(THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                                   THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                                   vec.begin(),
+                                   generate_random_integer<T>());
 
     return vec;
 }
 
 template<typename T>
-thrust::host_vector<T> random_samples(const size_t N)
+T random_integer()
+{
+    return generate_random_integer<T>()(0);
+}
+
+template<typename T>
+THRUST_NS_QUALIFIER::host_vector<T> random_samples(const size_t N)
 {
-    thrust::host_vector<T> vec(N);
-    thrust::transform(thrust::counting_iterator<size_t>(0),
-                      thrust::counting_iterator<size_t>(N),
-                      vec.begin(),
-                      random_sample<T>());
+    THRUST_NS_QUALIFIER::host_vector<T> vec(N);
+    THRUST_NS_QUALIFIER::transform(THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(0)),
+                                   THRUST_NS_QUALIFIER::counting_iterator<unsigned int>(static_cast<unsigned int>(N)),
+                                   vec.begin(),
+                                   generate_random_sample<T>());
 
     return vec;
 }
diff --git a/testing/unittest/runtime_static_assert.h b/testing/unittest/runtime_static_assert.h
new file mode 100644
index 000000000..d53bd3b20
--- /dev/null
+++ b/testing/unittest/runtime_static_assert.h
@@ -0,0 +1,98 @@
+#pragma once
+
+#include <string>
+
+#include <thrust/detail/static_assert.h>
+#undef THRUST_STATIC_ASSERT
+#undef THRUST_STATIC_ASSERT_MSG
+
+#define THRUST_STATIC_ASSERT(B) unittest::assert_static((B), __FILE__, __LINE__);
+#define THRUST_STATIC_ASSERT_MSG(B, msg) unittest::assert_static((B), __FILE__, __LINE__);
+
+namespace unittest
+{
+    __host__ __device__
+    void assert_static(bool condition, const char * filename, int lineno);
+}
+
+#include <thrust/device_new.h>
+#include <thrust/device_delete.h>
+
+#include <nv/target>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+
+
+#define ASSERT_STATIC_ASSERT(X) \
+    { \
+        bool triggered = false; \
+        typedef unittest::static_assert_exception ex_t; \
+        thrust::device_ptr<ex_t> device_ptr = thrust::device_new<ex_t>(); \
+        ex_t* raw_ptr = thrust::raw_pointer_cast(device_ptr); \
+        ::cudaMemcpyToSymbol(unittest::detail::device_exception, &raw_ptr, sizeof(ex_t*)); \
+        try { X; } catch (ex_t) { triggered = true; } \
+        if (!triggered) { \
+            triggered = static_cast<ex_t>(*device_ptr).triggered; \
+        } \
+        thrust::device_free(device_ptr); \
+        raw_ptr = NULL; \
+        ::cudaMemcpyToSymbol(unittest::detail::device_exception, &raw_ptr, sizeof(ex_t*)); \
+        if (!triggered) { unittest::UnitTestFailure f; f << "[" << __FILE__ << ":" << __LINE__ << "] did not trigger a THRUST_STATIC_ASSERT"; throw f; } \
+    }
+
+#else
+
+#define ASSERT_STATIC_ASSERT(X) \
+    { \
+        bool triggered = false; \
+        typedef unittest::static_assert_exception ex_t; \
+        try { X; } catch (ex_t) { triggered = true; } \
+        if (!triggered) { unittest::UnitTestFailure f; f << "[" << __FILE__ << ":" << __LINE__ << "] did not trigger a THRUST_STATIC_ASSERT"; throw f; } \
+    }
+
+#endif
+
+namespace unittest
+{
+    class static_assert_exception
+    {
+    public:
+        __host__ __device__
+        static_assert_exception() : triggered(false)
+        {
+        }
+
+        __host__ __device__
+        static_assert_exception(const char * filename, int lineno)
+            : triggered(true), filename(filename), lineno(lineno)
+        {
+        }
+
+        bool triggered;
+        const char * filename;
+        int lineno;
+    };
+
+    namespace detail
+    {
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+        __attribute__((used))
+#endif
+        __device__ static static_assert_exception* device_exception = NULL;
+    }
+
+    __host__ __device__
+    void assert_static(bool condition, const char * filename, int lineno)
+    {
+        if (!condition)
+        {
+            static_assert_exception ex(filename, lineno);
+
+            NV_IF_TARGET(NV_IS_DEVICE,
+                         (*detail::device_exception = ex;),
+                         (throw ex;));
+        }
+    }
+}
+
diff --git a/testing/unittest/special_types.h b/testing/unittest/special_types.h
index b046a96ee..9e4b0b743 100644
--- a/testing/unittest/special_types.h
+++ b/testing/unittest/special_types.h
@@ -128,7 +128,11 @@ void swap(user_swappable &x, user_swappable &y)
   y.was_swapped = false;
 }
 
-class my_system : public thrust::device_execution_policy<my_system>
+// Inheriting from classes in anonymous namespaces is not allowed.
+// The anonymous namespace tests don't use these, so just disable them:
+#ifndef THRUST_USE_ANON_NAMESPACE
+
+class my_system : public THRUST_NS_QUALIFIER::device_execution_policy<my_system>
 {
   public:
     my_system(int)
@@ -163,21 +167,23 @@ class my_system : public thrust::device_execution_policy<my_system>
     my_system();
 };
 
-struct my_tag : thrust::device_execution_policy<my_tag> {};
+struct my_tag : THRUST_NS_QUALIFIER::device_execution_policy<my_tag> {};
+
+#endif // THRUST_USE_ANON_NAMESPACE
 
 namespace unittest
 {
 
 
-using thrust::detail::int8_t;
-using thrust::detail::int16_t;
-using thrust::detail::int32_t;
-using thrust::detail::int64_t;
+using THRUST_NS_QUALIFIER::detail::int8_t;
+using THRUST_NS_QUALIFIER::detail::int16_t;
+using THRUST_NS_QUALIFIER::detail::int32_t;
+using THRUST_NS_QUALIFIER::detail::int64_t;
 
-using thrust::detail::uint8_t;
-using thrust::detail::uint16_t;
-using thrust::detail::uint32_t;
-using thrust::detail::uint64_t;
+using THRUST_NS_QUALIFIER::detail::uint8_t;
+using THRUST_NS_QUALIFIER::detail::uint16_t;
+using THRUST_NS_QUALIFIER::detail::uint32_t;
+using THRUST_NS_QUALIFIER::detail::uint64_t;
 
   
 }
diff --git a/testing/unittest/system.h b/testing/unittest/system.h
index f3602e994..766e732d3 100644
--- a/testing/unittest/system.h
+++ b/testing/unittest/system.h
@@ -12,7 +12,7 @@
 namespace unittest
 {
 
-#ifdef __GNUC__
+#if __GNUC__ && !_NVHPC_CUDA
 inline std::string demangle(const char* name)
 {
   int status = 0;
diff --git a/testing/testframework.cpp b/testing/unittest/testframework.cu
similarity index 91%
rename from testing/testframework.cpp
rename to testing/unittest/testframework.cu
index 88a184792..67d970399 100644
--- a/testing/testframework.cpp
+++ b/testing/unittest/testframework.cu
@@ -4,7 +4,7 @@
 
 // #include backends' testframework.h, if they exist and are required for the build
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include "backend/cuda/testframework.h"
+#include <unittest/cuda/testframework.h>
 #endif
 
 #include <iostream>
@@ -30,7 +30,7 @@ const size_t standard_test_sizes[] =
   (1 << 26) + 1, (1 << 27) - 1, (1 << 27)
 };
 
-        
+
 const size_t tiny_threshold    = 1 <<  5;  //   32
 const size_t small_threshold   = 1 <<  8;  //  256
 const size_t medium_threshold  = 1 << 12;  //   4K
@@ -38,7 +38,7 @@ const size_t default_threshold = 1 << 16;  //  64K
 const size_t large_threshold   = 1 << 20;  //   1M
 const size_t huge_threshold    = 1 << 24;  //  16M
 const size_t epic_threshold    = 1 << 26;  //  64M
-const size_t max_threshold     = std::numeric_limits<size_t>::max();
+const size_t max_threshold     = (std::numeric_limits<size_t>::max)();
 
 
 std::vector<size_t> test_sizes;
@@ -110,9 +110,9 @@ void process_args(int argc, char ** argv,
   {
     std::string arg(argv[i]);
 
-    // look for --key or --key=value arguments 
+    // look for --key or --key=value arguments
     if(arg.substr(0,2) == "--")
-    {   
+    {
       std::string::size_type n = arg.find('=',2);
 
       if(n == std::string::npos)
@@ -132,10 +132,10 @@ void process_args(int argc, char ** argv,
 }
 
 
-void usage(int argc, char** argv)
+void usage(int /*argc*/, char** argv)
 {
   std::string indent = "  ";
-  
+
   std::cout << "Example Usage:\n";
   std::cout << indent << argv[0] << "\n";
   std::cout << indent << argv[0] << " TestName1 [TestName2 ...] \n";
@@ -164,14 +164,14 @@ struct TestResult
   TestStatus  status;
   std::string name;
   std::string message;
-  
+
   // XXX use a c++11 timer result when available
   std::clock_t elapsed;
-  
+
   TestResult(const TestStatus status, std::clock_t elapsed, const UnitTest& u, const std::string& message = "")
       : status(status), name(u.name), message(message), elapsed(elapsed)
   {}
-  
+
   bool operator<(const TestResult& tr) const
   {
     if(status < tr.status)
@@ -199,20 +199,20 @@ void record_result(const TestResult& test_result, std::vector< TestResult >& tes
 void report_results(std::vector< TestResult >& test_results, double elapsed_minutes)
 {
   std::cout << std::endl;
-  
+
   std::string hline = "================================================================";
-  
+
   std::sort(test_results.begin(), test_results.end());
-  
+
   size_t num_passes = 0;
   size_t num_failures = 0;
   size_t num_known_failures = 0;
   size_t num_errors = 0;
-  
+
   for(size_t i = 0; i < test_results.size(); i++)
   {
     const TestResult& tr = test_results[i];
-    
+
     if(tr.status == Pass)
     {
       num_passes++;
@@ -220,7 +220,7 @@ void report_results(std::vector< TestResult >& test_results, double elapsed_minu
     else
     {
       std::cout << hline << std::endl;
-    
+
       switch(tr.status)
       {
         case Failure:
@@ -232,13 +232,13 @@ void report_results(std::vector< TestResult >& test_results, double elapsed_minu
         default:
           break;
       }
-    
+
       std::cout << ": " << tr.name << std::endl << tr.message << std::endl;
     }
   }
-  
+
   std::cout << hline << std::endl;
-  
+
   std::cout << "Totals: ";
   std::cout << num_failures << " failures, ";
   std::cout << num_known_failures << " known failures, ";
@@ -257,7 +257,7 @@ void UnitTestDriver::list_tests(void)
 }
 
 
-bool UnitTestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
+bool UnitTestDriver::post_test_smoke_check(const UnitTest &/*test*/, bool /*concise*/)
 {
   return true;
 }
@@ -266,60 +266,62 @@ bool UnitTestDriver::post_test_sanity_check(const UnitTest &test, bool concise)
 bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const ArgumentMap& kwargs)
 {
   std::time_t start_time = std::time(0);
-  
+
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN
   bool verbose = kwargs.count("verbose");
   bool concise = kwargs.count("concise");
-  
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END
+
   std::vector< TestResult > test_results;
-  
+
   if(verbose && concise)
   {
     std::cout << "--verbose and --concise cannot be used together" << std::endl;
     exit(EXIT_FAILURE);
   }
-  
+
   if(!concise)
   {
     std::cout << "Running " << tests_to_run.size() << " unit tests." << std::endl;
   }
-  
+
   for(size_t i = 0; i < tests_to_run.size(); i++)
   {
      UnitTest& test = *tests_to_run[i];
-  
+
      if(verbose)
      {
        std::cout << "Running " << test.name << "..." << std::flush;
      }
-  
+
      try
      {
        // time the test
        std::clock_t start = std::clock();
-  
+
        // run the test
        test.run();
-  
+
        // test passed
        record_result(TestResult(Pass, std::clock() - start, test), test_results);
-     } 
+     }
      catch(unittest::UnitTestFailure& f)
      {
-       record_result(TestResult(Failure, std::numeric_limits<std::clock_t>::max(), test, f.message), test_results);
+       record_result(TestResult(Failure, (std::numeric_limits<std::clock_t>::max)(), test, f.message), test_results);
      }
      catch(unittest::UnitTestKnownFailure& f)
      {
-       record_result(TestResult(KnownFailure, std::numeric_limits<std::clock_t>::max(), test, f.message), test_results);
+       record_result(TestResult(KnownFailure, (std::numeric_limits<std::clock_t>::max)(), test, f.message), test_results);
      }
      catch(std::bad_alloc& e)
      {
-       record_result(TestResult(Error, std::numeric_limits<std::clock_t>::max(), test, e.what()), test_results);
+       record_result(TestResult(Error, (std::numeric_limits<std::clock_t>::max)(), test, e.what()), test_results);
      }
      catch(unittest::UnitTestError& e)
      {
-       record_result(TestResult(Error, std::numeric_limits<std::clock_t>::max(), test, e.message), test_results);
+       record_result(TestResult(Error, (std::numeric_limits<std::clock_t>::max)(), test, e.message), test_results);
      }
-  
+
      // immediate report
      if(!concise)
      {
@@ -340,7 +342,7 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
            default:
              break;
          }
-  
+
          std::cout << " " << test.name << std::endl;
        }
        else
@@ -360,24 +362,24 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
          }
        }
      }
-  
-     if(!post_test_sanity_check(test, concise))
+
+     if(!post_test_smoke_check(test, concise))
      {
        return false;
      }
-  
+
      std::cout.flush();
   }
-  
+
   double elapsed_minutes = double(std::time(0) - start_time) / 60;
-  
+
   // summary report
   if(!concise)
   {
     report_results(test_results, elapsed_minutes);
   }
-  
-  
+
+
   // if any failures or errors return false
   for(size_t i = 0; i < test_results.size(); i++)
   {
@@ -386,7 +388,7 @@ bool UnitTestDriver::run_tests(std::vector<UnitTest *>& tests_to_run, const Argu
       return false;
     }
   }
-  
+
   // all tests pass or are known failures
   return true;
 }
@@ -398,35 +400,35 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
   {
     // run all tests
     std::vector<UnitTest *> tests_to_run;
-    
+
     for(TestMap::iterator iter = test_map.begin(); iter != test_map.end(); iter++)
     {
       tests_to_run.push_back(iter->second);
     }
-    
+
     return run_tests(tests_to_run, kwargs);
   }
   else
   {
     // all non-keyword arguments are assumed to be test names or partial test names
-  
+
     typedef TestMap::iterator               TestMapIterator;
-  
+
     // vector to accumulate tests
     std::vector<UnitTest *> tests_to_run;
-  
+
     for(ArgumentSet::const_iterator iter = args.begin(); iter != args.end(); iter++)
     {
       const std::string& arg = *iter;
-  
+
       size_t len = arg.size();
       size_t matches = 0;
-  
+
       if(arg[len-1] == '*')
       {
         // wildcard search
         std::string search = arg.substr(0,len-1);
-  
+
         TestMapIterator lb = test_map.lower_bound(search);
         while(lb != test_map.end())
         {
@@ -434,8 +436,8 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
           {
             break;
           }
-  
-          tests_to_run.push_back(lb->second); 
+
+          tests_to_run.push_back(lb->second);
           lb++;
           matches++;
         }
@@ -444,21 +446,21 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
       {
         // non-wildcard search
         TestMapIterator lb = test_map.find(arg);
-  
+
         if(lb != test_map.end())
         {
-          tests_to_run.push_back(lb->second); 
+          tests_to_run.push_back(lb->second);
           matches++;
         }
       }
-  
+
       if(matches == 0)
       {
         std::cout << "[ERROR] found no test names matching the pattern: " << arg << std::endl;
         return false;
       }
     }
-  
+
     return run_tests(tests_to_run, kwargs);
   }
 }
@@ -466,7 +468,7 @@ bool UnitTestDriver::run_tests(const ArgumentSet& args, const ArgumentMap& kwarg
 
 // driver_instance maps a DeviceSystem to a singleton UnitTestDriver
 template<typename DeviceSystem>
-UnitTestDriver &driver_instance(DeviceSystem tag)
+UnitTestDriver &driver_instance(DeviceSystem)
 {
   static UnitTestDriver s_instance;
   return s_instance;
@@ -485,21 +487,21 @@ int main(int argc, char **argv)
 {
   ArgumentSet args;
   ArgumentMap kwargs;
-  
+
   process_args(argc, argv, args, kwargs);
-  
+
   if(kwargs.count("help"))
   {
     usage(argc, argv);
     return 0;
   }
-  
+
   if(kwargs.count("list"))
   {
     UnitTestDriver::s_driver().list_tests();
     return 0;
   }
-  
+
   if(kwargs.count("sizes"))
   {
     set_test_sizes(kwargs["sizes"]);
@@ -508,14 +510,14 @@ int main(int argc, char **argv)
   {
     set_test_sizes("default");
   }
-  
+
   bool passed = UnitTestDriver::s_driver().run_tests(args, kwargs);
-  
+
   if(kwargs.count("concise"))
   {
     std::cout << ((passed) ? "PASSED" : "FAILED") << std::endl;
   }
-  
+
   return (passed) ? EXIT_SUCCESS : EXIT_FAILURE;
 }
 
diff --git a/testing/unittest/testframework.h b/testing/unittest/testframework.h
index fe608fb75..c6ced96e7 100644
--- a/testing/unittest/testframework.h
+++ b/testing/unittest/testframework.h
@@ -1,16 +1,24 @@
 #pragma once
 
+#include <cstdio>
+#include <iostream>
+#include <map>
+#include <set>
 #include <string>
+#include <type_traits>
 #include <vector>
-#include <set>
-#include <map>
-#include <iostream>
-
-#include <stdio.h>
 
 #include "meta.h"
 #include "util.h"
 
+#include <thrust/limits.h>
+#include <thrust/detail/config.h>
+#include <thrust/detail/integer_traits.h>
+#include <thrust/mr/host_memory_resource.h>
+#include <thrust/mr/device_memory_resource.h>
+#include <thrust/mr/universal_memory_resource.h>
+#include <thrust/mr/allocator.h>
+
 // define some common lists of types
 typedef unittest::type_list<int,
                             unsigned int,
@@ -60,6 +68,185 @@ typedef unittest::type_list<long long,
 typedef unittest::type_list<float,
                             double> FloatingPointTypes;
 
+// A type that behaves as if it was a normal numeric type,
+// so it can be used in the same tests as "normal" numeric types.
+// NOTE: This is explicitly NOT proclaimed trivially reloctable.
+class custom_numeric
+{
+public:
+    __host__ __device__
+    custom_numeric()
+    {
+        fill(0);
+    }
+
+    // Allow construction from any integral numeric.
+    template <typename T,
+              typename = typename std::enable_if<std::is_integral<T>::value>::type>
+    __host__ __device__
+    custom_numeric(const T& i)
+    {
+        fill(static_cast<int>(i));
+    }
+
+    __host__ __device__
+    custom_numeric(const custom_numeric & other)
+    {
+        fill(other.value[0]);
+    }
+
+    __host__ __device__
+    custom_numeric & operator=(int val)
+    {
+        fill(val);
+        return *this;
+    }
+
+    __host__ __device__
+    custom_numeric & operator=(const custom_numeric & other)
+    {
+        fill(other.value[0]);
+        return *this;
+    }
+
+    // cast to void * instead of bool to fool overload resolution
+    // WTB C++11 explicit conversion operators
+    __host__ __device__
+    operator void *() const
+    {
+        // static cast first to avoid MSVC warning C4312
+        return reinterpret_cast<void *>(static_cast<std::size_t>(value[0]));
+    }
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric & operator op() {                                \
+        fill(op value[0]);                                          \
+        return *this;                                               \
+    }                                                               \
+    __host__ __device__                                             \
+    custom_numeric operator op(int) const {                         \
+        custom_numeric ret(*this);                                  \
+        op ret;                                                     \
+        return ret;                                                 \
+    }
+
+    DEFINE_OPERATOR(++)
+    DEFINE_OPERATOR(--)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric operator op () const                             \
+    {                                                               \
+        return custom_numeric(op value[0]);                         \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(~)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric operator op (const custom_numeric & other) const \
+    {                                                               \
+        return custom_numeric(value[0] op other.value[0]);          \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(*)
+    DEFINE_OPERATOR(/)
+    DEFINE_OPERATOR(%)
+    DEFINE_OPERATOR(<<)
+    DEFINE_OPERATOR(>>)
+    DEFINE_OPERATOR(&)
+    DEFINE_OPERATOR(|)
+    DEFINE_OPERATOR(^)
+
+#undef DEFINE_OPERATOR
+
+#define CONCAT(X, Y) X ## Y
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    custom_numeric & operator CONCAT(op, =) (const custom_numeric & other) \
+    {                                                               \
+        fill(value[0] op other.value[0]);                           \
+        return *this;                                               \
+    }
+
+    DEFINE_OPERATOR(+)
+    DEFINE_OPERATOR(-)
+    DEFINE_OPERATOR(*)
+    DEFINE_OPERATOR(/)
+    DEFINE_OPERATOR(%)
+    DEFINE_OPERATOR(<<)
+    DEFINE_OPERATOR(>>)
+    DEFINE_OPERATOR(&)
+    DEFINE_OPERATOR(|)
+    DEFINE_OPERATOR(^)
+
+#undef DEFINE_OPERATOR
+
+#define DEFINE_OPERATOR(op)                                         \
+    __host__ __device__                                             \
+    friend bool operator op (const custom_numeric & lhs, const custom_numeric & rhs) \
+    {                                                               \
+        return lhs.value[0] op rhs.value[0];                        \
+    }
+
+    DEFINE_OPERATOR(==)
+    DEFINE_OPERATOR(!=)
+    DEFINE_OPERATOR(<)
+    DEFINE_OPERATOR(<=)
+    DEFINE_OPERATOR(>)
+    DEFINE_OPERATOR(>=)
+    DEFINE_OPERATOR(&&)
+    DEFINE_OPERATOR(||);
+
+
+#undef DEFINE_OPERATOR
+
+    friend std::ostream & operator<<(std::ostream & os, const custom_numeric & val)
+    {
+        return os << "custom_numeric{" << val.value[0] << "}";
+    }
+
+private:
+    int value[5];
+
+    __host__ __device__
+    void fill(int val)
+    {
+        for (int i = 0; i < 5; ++i)
+        {
+            value[i] = val;
+        }
+    }
+};
+
+THRUST_NAMESPACE_BEGIN
+
+template <>
+struct numeric_limits<custom_numeric> : numeric_limits<int> {};
+
+namespace detail
+{
+
+// For random number generation
+template<>
+class integer_traits<custom_numeric>
+  : public integer_traits_base<int, INT_MIN, INT_MAX>
+{};
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
+
 typedef unittest::type_list<char,
                             signed char,
                             unsigned char,
@@ -71,9 +258,23 @@ typedef unittest::type_list<char,
                             unsigned long,
                             long long,
                             unsigned long long,
-                            float> NumericTypes;
-// exclude double from NumericTypes
+                            float,
+                            double,
+                            custom_numeric> NumericTypes;
 
+typedef unittest::type_list<char,
+                            signed char,
+                            unsigned char,
+                            short,
+                            unsigned short,
+                            int,
+                            unsigned int,
+                            long,
+                            unsigned long,
+                            long long,
+                            unsigned long long,
+                            float,
+                            double> BuiltinNumericTypes;
 
 inline void chop_prefix(std::string& str, const std::string& prefix)
 {
@@ -83,17 +284,20 @@ inline void chop_prefix(std::string& str, const std::string& prefix)
 inline std::string base_class_name(const std::string& name)
 {
   std::string result = name;
-  
+
   // if the name begins with "struct ", chop it off
   chop_prefix(result, "struct ");
-  
+
   // if the name begins with "class ", chop it off
   chop_prefix(result, "class ");
 
-  // chop everything including and after first "<"
-  return result.replace(result.find_first_of("<"),
-                        result.size(),
-                        "");
+  const std::size_t first_lt = result.find_first_of("<");
+
+  if (first_lt < result.size())
+      // chop everything including and after first "<"
+      return result.replace(first_lt, result.size(), "");
+  else
+      return result;
 }
 
 enum TestStatus { Pass = 0, Failure = 1, KnownFailure = 2, Error = 3, UnknownException = 4};
@@ -112,7 +316,7 @@ class UnitTest {
         virtual ~UnitTest() {}
         virtual void run() {}
 
-        bool operator<(const UnitTest& u) const 
+        bool operator<(const UnitTest& u) const
         {
             return name < u.name;
         }
@@ -133,19 +337,18 @@ class UnitTestDriver
   // \param test The UnitTest of interest
   // \param concise Whether or not to suppress output
   // \return true if all is well; false if the tests must be immediately aborted
-  virtual bool post_test_sanity_check(const UnitTest &test, bool concise);
+  virtual bool post_test_smoke_check(const UnitTest &test, bool concise);
 
 public:
   inline virtual ~UnitTestDriver() {};
 
   void register_test(UnitTest * test);
   virtual bool run_tests(const ArgumentSet& args, const ArgumentMap& kwargs);
-  void list_tests(void); 
+  void list_tests(void);
 
   static UnitTestDriver &s_driver();
 };
 
-
 // Macro to create a single unittest
 #define DECLARE_UNITTEST(TEST)                                   \
 class TEST##UnitTest : public UnitTest {                         \
@@ -157,16 +360,108 @@ class TEST##UnitTest : public UnitTest {                         \
 };                                                               \
 TEST##UnitTest TEST##Instance
 
+#define DECLARE_UNITTEST_WITH_NAME(TEST, NAME)                   \
+class NAME##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    NAME##UnitTest() : UnitTest(#NAME) {}                        \
+    void run(){                                                  \
+        TEST();                                                  \
+    }                                                            \
+};                                                               \
+NAME##UnitTest NAME##Instance
+
 // Macro to create host and device versions of a
-// unit test for a couple data types
-#define DECLARE_VECTOR_UNITTEST(VTEST)                                                                            \
-void VTEST##Host(void)   {  VTEST< thrust::host_vector<short> >();   VTEST< thrust::host_vector<int> >();   }    \
-void VTEST##Device(void) {  VTEST< thrust::device_vector<short> >(); VTEST< thrust::device_vector<int> >(); }    \
-DECLARE_UNITTEST(VTEST##Host);                                                                                    \
-DECLARE_UNITTEST(VTEST##Device);
-
-// Macro to create instances of a test for several 
-// data types and array sizes
+// unit test for a bunch of data types
+#define DECLARE_VECTOR_UNITTEST(VTEST)                          \
+void VTEST##Host(void) {                                        \
+    VTEST< thrust::host_vector<signed char> >();                \
+    VTEST< thrust::host_vector<short> >();                      \
+    VTEST< thrust::host_vector<int> >();                        \
+    VTEST< thrust::host_vector<float> >();                      \
+    VTEST< thrust::host_vector<custom_numeric> >();             \
+    /* MR vectors */                                            \
+    VTEST< thrust::host_vector<int,                             \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::host_memory_resource> > >();                \
+}                                                               \
+void VTEST##Device(void) {                                      \
+    VTEST< thrust::device_vector<signed char> >();              \
+    VTEST< thrust::device_vector<short> >();                    \
+    VTEST< thrust::device_vector<int> >();                      \
+    VTEST< thrust::device_vector<float> >();                    \
+    VTEST< thrust::device_vector<custom_numeric> >();           \
+    /* MR vectors */                                            \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::device_memory_resource> > >();              \
+}                                                               \
+void VTEST##Universal(void) {                                   \
+    VTEST< thrust::universal_vector<int> >();                   \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::universal_host_pinned_memory_resource> > >();\
+}                                                               \
+DECLARE_UNITTEST(VTEST##Host);                                  \
+DECLARE_UNITTEST(VTEST##Device);                                \
+DECLARE_UNITTEST(VTEST##Universal);
+
+// Same as above, but only for integral types
+#define DECLARE_INTEGRAL_VECTOR_UNITTEST(VTEST)                 \
+void VTEST##Host(void) {                                        \
+    VTEST< thrust::host_vector<signed char> >();                \
+    VTEST< thrust::host_vector<short> >();                      \
+    VTEST< thrust::host_vector<int> >();                        \
+}                                                               \
+void VTEST##Device(void) {                                      \
+    VTEST< thrust::device_vector<signed char> >();              \
+    VTEST< thrust::device_vector<short> >();                    \
+    VTEST< thrust::device_vector<int> >();                      \
+}                                                               \
+void VTEST##Universal(void) {                                   \
+    VTEST< thrust::universal_vector<int> >();                   \
+    VTEST< thrust::device_vector<int,                           \
+        thrust::mr::stateless_resource_allocator<int,           \
+            thrust::universal_host_pinned_memory_resource> > >();\
+}                                                               \
+DECLARE_UNITTEST(VTEST##Host);                                  \
+DECLARE_UNITTEST(VTEST##Device);                                \
+DECLARE_UNITTEST(VTEST##Universal);
+
+// Macro to create instances of a test for several data types.
+#define DECLARE_GENERIC_UNITTEST(TEST)                           \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        TEST<signed char>();                                     \
+        TEST<unsigned char>();                                   \
+        TEST<short>();                                           \
+        TEST<unsigned short>();                                  \
+        TEST<int>();                                             \
+        TEST<unsigned int>();                                    \
+        TEST<float>();                                           \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+// Macro to create instances of a test for several array sizes.
+#define DECLARE_SIZED_UNITTEST(TEST)                             \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        std::vector<size_t> sizes = get_test_sizes();            \
+        for(size_t i = 0; i != sizes.size(); ++i)                \
+        {                                                        \
+            TEST(sizes[i]);                                      \
+        }                                                        \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+// Macro to create instances of a test for several data types and array sizes
 #define DECLARE_VARIABLE_UNITTEST(TEST)                          \
 class TEST##UnitTest : public UnitTest {                         \
     public:                                                      \
@@ -176,18 +471,55 @@ class TEST##UnitTest : public UnitTest {                         \
         std::vector<size_t> sizes = get_test_sizes();            \
         for(size_t i = 0; i != sizes.size(); ++i)                \
         {                                                        \
-            TEST<char>(sizes[i]);                                \
+            TEST<signed char>(sizes[i]);                         \
             TEST<unsigned char>(sizes[i]);                       \
             TEST<short>(sizes[i]);                               \
             TEST<unsigned short>(sizes[i]);                      \
             TEST<int>(sizes[i]);                                 \
             TEST<unsigned int>(sizes[i]);                        \
             TEST<float>(sizes[i]);                               \
+            TEST<double>(sizes[i]);                              \
+        }                                                        \
+    }                                                            \
+};                                                               \
+TEST##UnitTest TEST##Instance
+
+#define DECLARE_INTEGRAL_VARIABLE_UNITTEST(TEST)                 \
+class TEST##UnitTest : public UnitTest {                         \
+    public:                                                      \
+    TEST##UnitTest() : UnitTest(#TEST) {}                        \
+    void run()                                                   \
+    {                                                            \
+        std::vector<size_t> sizes = get_test_sizes();            \
+        for(size_t i = 0; i != sizes.size(); ++i)                \
+        {                                                        \
+            TEST<signed char>(sizes[i]);                         \
+            TEST<unsigned char>(sizes[i]);                       \
+            TEST<short>(sizes[i]);                               \
+            TEST<unsigned short>(sizes[i]);                      \
+            TEST<int>(sizes[i]);                                 \
+            TEST<unsigned int>(sizes[i]);                        \
         }                                                        \
     }                                                            \
 };                                                               \
 TEST##UnitTest TEST##Instance
 
+#define DECLARE_GENERIC_UNITTEST_WITH_TYPES_AND_NAME(TEST, TYPES, NAME)       \
+  ::SimpleUnitTest<TEST, TYPES> NAME##_instance(#NAME)                        \
+  /**/
+
+#define DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES_AND_NAME(TEST, TYPES, NAME) \
+  ::VariableUnitTest<TEST, TYPES> NAME##_instance(#NAME)                      \
+  /**/
+
+#define DECLARE_GENERIC_UNITTEST_WITH_TYPES(TEST, TYPES)                      \
+  ::SimpleUnitTest<TEST, TYPES> TEST##_instance(#TEST)                        \
+  /**/
+
+#define DECLARE_GENERIC_SIZED_UNITTEST_WITH_TYPES(TEST, TYPES)                \
+  ::VariableUnitTest<TEST, TYPES> TEST##_instance(#TEST)                      \
+  /**/
+
 template<template <typename> class TestName, typename TypeList>
   class SimpleUnitTest : public UnitTest
 {
@@ -195,6 +527,9 @@ template<template <typename> class TestName, typename TypeList>
     SimpleUnitTest()
       : UnitTest(base_class_name(unittest::type_name<TestName<int> >()).c_str()) {}
 
+    SimpleUnitTest(const char * name)
+      : UnitTest(name) {}
+
     void run()
     {
       // get the first type in the list
@@ -215,11 +550,14 @@ template<template <typename> class TestName, typename TypeList>
     VariableUnitTest()
       : UnitTest(base_class_name(unittest::type_name<TestName<int> >()).c_str()) {}
 
+    VariableUnitTest(const char * name)
+      : UnitTest(name) {}
+
     void run()
     {
         std::vector<size_t> sizes = get_test_sizes();
         for(size_t i = 0; i != sizes.size(); ++i)
-        {                                                 
+        {
             // get the first type in the list
             typedef typename unittest::get_type<TypeList,0>::type first_type;
 
@@ -227,7 +565,7 @@ template<template <typename> class TestName, typename TypeList>
 
             // loop over the types
             loop(sizes[i]);
-        }                                                 
+        }
     }
 }; // end VariableUnitTest
 
@@ -239,10 +577,13 @@ template<template <typename> class TestName,
     : public UnitTest
 {
   VectorUnitTest()
-    : UnitTest((base_class_name(unittest::type_name<TestName< Vector<int, Alloc<int> > > >()) + "<" + 
+    : UnitTest((base_class_name(unittest::type_name<TestName< Vector<int, Alloc<int> > > >()) + "<" +
                 base_class_name(unittest::type_name<Vector<int, Alloc<int> > >()) + ">").c_str())
   { }
 
+  VectorUnitTest(const char * name)
+    : UnitTest(name) {}
+
   void run()
   {
     // zip up the type list with Alloc
diff --git a/testing/unittest/util.h b/testing/unittest/util.h
index db3da5659..986f80c7b 100644
--- a/testing/unittest/util.h
+++ b/testing/unittest/util.h
@@ -5,6 +5,10 @@
 #include <typeinfo>
 #include <unittest/system.h>
 
+#include <thrust/extrema.h>
+#include <thrust/limits.h>
+#include <thrust/detail/type_traits.h>
+
 namespace unittest
 {
 
@@ -14,6 +18,31 @@ template<typename T>
   return demangle(typeid(T).name());
 } // end type_name()
 
+// Use this with counting_iterator to avoid generating a range larger than we
+// can represent.
+template <typename T>
+typename THRUST_NS_QUALIFIER::detail::disable_if<
+  THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
+, T
+>::type truncate_to_max_representable(std::size_t n)
+{
+  return static_cast<T>(THRUST_NS_QUALIFIER::min<std::size_t>(
+    n,
+    static_cast<std::size_t>(THRUST_NS_QUALIFIER::numeric_limits<T>::max())));
+}
+
+// TODO: This probably won't work for `half`.
+template <typename T>
+typename THRUST_NS_QUALIFIER::detail::enable_if<
+  THRUST_NS_QUALIFIER::detail::is_floating_point<T>::value
+, T
+>::type truncate_to_max_representable(std::size_t n)
+{
+  return THRUST_NS_QUALIFIER::min<T>(
+    static_cast<T>(n),
+    THRUST_NS_QUALIFIER::numeric_limits<T>::max());
+}
+
 } // end unittest
 
 template <typename Iterator>
diff --git a/testing/unittest/util_async.h b/testing/unittest/util_async.h
new file mode 100644
index 000000000..9a3454efd
--- /dev/null
+++ b/testing/unittest/util_async.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <unittest/unittest.h>
+
+#include <thrust/future.h>
+
+#define TEST_EVENT_WAIT(e)                                                    \
+  ::unittest::test_event_wait(e, __FILE__, __LINE__)                          \
+  /**/
+
+#define TEST_FUTURE_VALUE_RETRIEVAL(f)                                        \
+  ::unittest::test_future_value_retrieval(f, __FILE__, __LINE__)              \
+  /**/
+
+namespace unittest
+{
+
+template <typename Event>
+__host__
+void test_event_wait(
+  Event&& e, std::string const& filename = "unknown", int lineno = -1
+)
+{
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno);
+
+  e.wait();
+  e.wait();
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, e.ready(), filename, lineno);
+}
+
+template <typename Future>
+__host__
+auto test_future_value_retrieval(
+  Future&& f, std::string const& filename = "unknown", int lineno = -1
+) -> decltype(f.extract())
+{
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno);
+
+  auto const r0 = f.get();
+  auto const r1 = f.get();
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.ready(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(true, f.valid_content(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r0, r1, filename, lineno);
+
+  auto const r2 = f.extract();
+
+  ASSERT_THROWS_EQUAL_WITH_FILE_AND_LINE(
+    auto x = f.extract();
+    THRUST_UNUSED_VAR(x)
+  , thrust::event_error
+  , thrust::event_error(thrust::event_errc::no_content)
+  , filename, lineno
+  );
+
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.ready(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_stream(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(false, f.valid_content(), filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r2, r1, filename, lineno);
+  ASSERT_EQUAL_WITH_FILE_AND_LINE(r2, r0, filename, lineno);
+
+  return r2;
+}
+
+} // namespace unittest
+
+#endif // THRUST_CPP_DIALECT >= 2014
diff --git a/testing/unittest_static_assert.cmake b/testing/unittest_static_assert.cmake
new file mode 100644
index 000000000..a8a96f2bd
--- /dev/null
+++ b/testing/unittest_static_assert.cmake
@@ -0,0 +1,10 @@
+# Disable unreachable code warnings.
+# This test unconditionally throws in some places, the compiler will detect that
+# control flow will never reach some instructions. This is intentional.
+target_link_libraries(${test_target} PRIVATE thrust.silence_unreachable_code_warnings)
+
+# The machinery behind this test is not compatible with NVC++.
+# See https://github.com/NVIDIA/thrust/issues/1397
+if ("NVCXX" STREQUAL "${CMAKE_CUDA_COMPILER_ID}")
+  set_tests_properties(${test_target} PROPERTIES DISABLED True)
+endif()
diff --git a/testing/unittest_static_assert.cu b/testing/unittest_static_assert.cu
new file mode 100644
index 000000000..7ed0d5658
--- /dev/null
+++ b/testing/unittest_static_assert.cu
@@ -0,0 +1,30 @@
+#include <unittest/runtime_static_assert.h>
+#include <unittest/unittest.h>
+#include <thrust/generate.h>
+
+template<typename T>
+struct dependent_false
+{
+    enum { value = false };
+};
+
+template<typename T>
+struct static_assertion
+{
+    __host__ __device__
+    T operator()() const
+    {
+        THRUST_STATIC_ASSERT(dependent_false<T>::value);
+        return 0;
+    }
+};
+
+template<typename V>
+void TestStaticAssertAssert()
+{
+    using value_type = typename V::value_type;
+    V test(10);
+    ASSERT_STATIC_ASSERT(thrust::generate(test.begin(), test.end(),
+                                          static_assertion<value_type>()));
+}
+DECLARE_VECTOR_UNITTEST(TestStaticAssertAssert);
diff --git a/testing/unittest_tester.cu b/testing/unittest_tester.cu
index 99eb5c881..27e97ca91 100644
--- a/testing/unittest_tester.cu
+++ b/testing/unittest_tester.cu
@@ -22,6 +22,18 @@ void TestAssertGEqual(void)
 }
 DECLARE_UNITTEST(TestAssertGEqual);
 
+void TestAssertLess(void)
+{
+    ASSERT_LESS(0, 1);
+}
+DECLARE_UNITTEST(TestAssertLess);
+
+void TestAssertGreater(void)
+{
+    ASSERT_GREATER(1, 0);
+}
+DECLARE_UNITTEST(TestAssertGreater);
+
 void TestTypeName(void)
 {
     ASSERT_EQUAL(unittest::type_name<char>(),          "char");
diff --git a/testing/universal_memory.cu b/testing/universal_memory.cu
new file mode 100644
index 000000000..18a30fbfe
--- /dev/null
+++ b/testing/universal_memory.cu
@@ -0,0 +1,166 @@
+#include <unittest/unittest.h>
+
+#include <thrust/sequence.h>
+#include <thrust/allocate_unique.h>
+#include <thrust/universal_vector.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <numeric>
+#include <vector>
+
+namespace
+{
+
+// The managed_memory_pointer class should be identified as a
+// contiguous_iterator
+THRUST_STATIC_ASSERT(
+  thrust::is_contiguous_iterator<thrust::universal_allocator<int>::pointer>::value);
+
+template <typename T>
+struct some_object {
+  some_object(T data)
+      : m_data(data)
+  {}
+
+  void setter(T data) { m_data = data; }
+  T getter() const { return m_data; }
+
+private:
+  T m_data;
+};
+
+} // namespace
+
+template <typename T>
+void TestUniversalAllocateUnique()
+{
+  // Simple test to ensure that pointers created with universal_memory_resource
+  // can be dereferenced and used with STL code. This is necessary as some
+  // STL implementations break when using fancy references that overload
+  // operator&, so universal_memory_resource uses a special pointer type that
+  // returns regular C++ references that can be safely used host-side.
+
+  // These operations fail to compile with fancy references:
+  auto raw = thrust::allocate_unique<T>(thrust::universal_allocator<T>{}, 42);
+  auto obj = thrust::allocate_unique<some_object<T>>(
+    thrust::universal_allocator<some_object<T> >{}, 42
+  );
+
+  static_assert(
+    std::is_same<decltype(raw.get()),
+                 thrust::universal_ptr<T> >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+  static_assert(
+    std::is_same<decltype(obj.get()),
+                 thrust::universal_ptr<some_object<T> > >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  ASSERT_EQUAL(*raw, T(42));
+  ASSERT_EQUAL(*raw.get(), T(42));
+  ASSERT_EQUAL(obj->getter(), T(42));
+  ASSERT_EQUAL((*obj).getter(), T(42));
+  ASSERT_EQUAL(obj.get()->getter(), T(42));
+  ASSERT_EQUAL((*obj.get()).getter(), T(42));
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalAllocateUnique);
+
+template <typename T>
+void TestUniversalIterationRaw()
+{
+  auto array = thrust::allocate_unique_n<T>(
+    thrust::universal_allocator<T>{}, 6, 42);
+
+  static_assert(
+    std::is_same<decltype(array.get()), thrust::universal_ptr<T> >::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(*iter, T(42));
+    ASSERT_EQUAL(*iter.get(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalIterationRaw);
+
+template <typename T>
+void TestUniversalIterationObj()
+{
+  auto array = thrust::allocate_unique_n<some_object<T>>(
+    thrust::universal_allocator<some_object<T>>{}, 6, 42);
+
+  static_assert(
+    std::is_same<decltype(array.get()),
+                 thrust::universal_ptr<some_object<T>>>::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  for (auto iter = array.get(), end = array.get() + 6; iter < end; ++iter)
+  {
+    ASSERT_EQUAL(iter->getter(), T(42));
+    ASSERT_EQUAL((*iter).getter(), T(42));
+    ASSERT_EQUAL(iter.get()->getter(), T(42));
+    ASSERT_EQUAL((*iter.get()).getter(), T(42));
+  }
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalIterationObj);
+
+template <typename T>
+void TestUniversalRawPointerCast()
+{
+  auto obj = thrust::allocate_unique<T>(thrust::universal_allocator<T>{}, 42);
+
+  static_assert(
+    std::is_same<decltype(obj.get()), thrust::universal_ptr<T>>::value,
+    "Unexpected pointer type returned from std::unique_ptr::get.");
+
+  static_assert(
+    std::is_same<decltype(thrust::raw_pointer_cast(obj.get())), T*>::value,
+    "Unexpected pointer type returned from thrust::raw_pointer_cast.");
+
+  *thrust::raw_pointer_cast(obj.get()) = T(17);
+
+  ASSERT_EQUAL(*obj, T(17));
+}
+DECLARE_GENERIC_UNITTEST(TestUniversalRawPointerCast);
+
+template <typename T>
+void TestUniversalThrustVector(std::size_t const n)
+{
+  thrust::host_vector<T>      host(n);
+  thrust::universal_vector<T> universal(n);
+
+  static_assert(
+    std::is_same<typename std::decay<decltype(universal)>::type::pointer,
+                 thrust::universal_ptr<T>>::value,
+    "Unexpected thrust::universal_vector pointer type.");
+
+  thrust::sequence(host.begin(), host.end(), 0);
+  thrust::sequence(universal.begin(), universal.end(), 0);
+
+  ASSERT_EQUAL(host.size(), n);
+  ASSERT_EQUAL(universal.size(), n);
+  ASSERT_EQUAL(host, universal);
+}
+DECLARE_VARIABLE_UNITTEST(TestUniversalThrustVector);
+
+// Verify that a std::vector using the universal allocator will work with
+// Standard Library algorithms.
+template <typename T>
+void TestUniversalStdVector(std::size_t const n)
+{
+  std::vector<T>                                 host(n);
+  std::vector<T, thrust::universal_allocator<T>> universal(n);
+
+  static_assert(
+    std::is_same<typename std::decay<decltype(universal)>::type::pointer,
+                 thrust::universal_ptr<T>>::value,
+    "Unexpected std::vector pointer type.");
+
+  std::iota(host.begin(), host.end(), 0);
+  std::iota(universal.begin(), universal.end(), 0);
+
+  ASSERT_EQUAL(host.size(), n);
+  ASSERT_EQUAL(universal.size(), n);
+  ASSERT_EQUAL(host, universal);
+}
+DECLARE_VARIABLE_UNITTEST(TestUniversalStdVector);
+
diff --git a/testing/vector.cu b/testing/vector.cu
index d99bcfd30..b09a4b55c 100644
--- a/testing/vector.cu
+++ b/testing/vector.cu
@@ -1,16 +1,22 @@
 #include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
 #include <thrust/sequence.h>
 #include <thrust/device_malloc_allocator.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#include <initializer_list>
+#endif
 #include <vector>
 #include <list>
 #include <limits>
-
+#include <utility>
 
 template <class Vector>
 void TestVectorZeroSize(void)
 {
     Vector v;
-    ASSERT_EQUAL(v.size(), 0);
+    ASSERT_EQUAL(v.size(), 0lu);
     ASSERT_EQUAL((v.begin() == v.end()), true);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorZeroSize);
@@ -34,6 +40,30 @@ void TestVectorBool(void)
 }
 DECLARE_UNITTEST(TestVectorBool);
 
+template <class Vector>
+void TestVectorInitializerList(void)
+{
+    Vector v{1, 2, 3};
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 2);
+    ASSERT_EQUAL(v[2], 3);
+
+    v = {1, 2, 3, 4};
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 2);
+    ASSERT_EQUAL(v[2], 3);
+    ASSERT_EQUAL(v[3], 4);
+    
+    const auto alloc = v.get_allocator();
+    Vector v2{{1, 2, 3}, alloc};
+    ASSERT_EQUAL(v2.size(), 3lu);
+    ASSERT_EQUAL(v2[0], 1);
+    ASSERT_EQUAL(v2[1], 2);
+    ASSERT_EQUAL(v2[2], 3);
+}
+DECLARE_VECTOR_UNITTEST(TestVectorInitializerList);
 
 template <class Vector>
 void TestVectorFrontBack(void)
@@ -43,8 +73,8 @@ void TestVectorFrontBack(void)
     Vector v(3);
     v[0] = 0; v[1] = 1; v[2] = 2;
 
-    ASSERT_EQUAL(v.front(), 0);
-    ASSERT_EQUAL(v.back(),  2);
+    ASSERT_EQUAL(v.front(), T(0));
+    ASSERT_EQUAL(v.back(),  T(2));
 }
 DECLARE_VECTOR_UNITTEST(TestVectorFrontBack);
 
@@ -52,7 +82,8 @@ DECLARE_VECTOR_UNITTEST(TestVectorFrontBack);
 template <class Vector>
 void TestVectorData(void)
 {
-    typedef typename Vector::value_type T;
+    typedef typename Vector::pointer PointerT;
+    typedef typename Vector::const_pointer PointerConstT;
 
     Vector v(3);
     v[0] = 0; v[1] = 1; v[2] = 2;
@@ -60,18 +91,18 @@ void TestVectorData(void)
     ASSERT_EQUAL(0,          *v.data());
     ASSERT_EQUAL(1,          *(v.data() + 1));
     ASSERT_EQUAL(2,          *(v.data() + 2));
-    ASSERT_EQUAL(&v.front(),  v.data());
-    ASSERT_EQUAL(&*v.begin(), v.data());
-    ASSERT_EQUAL(&v[0],       v.data());
+    ASSERT_EQUAL(PointerT(&v.front()),  v.data());
+    ASSERT_EQUAL(PointerT(&*v.begin()), v.data());
+    ASSERT_EQUAL(PointerT(&v[0]),       v.data());
 
     const Vector &c_v = v;
 
     ASSERT_EQUAL(0,            *c_v.data());
     ASSERT_EQUAL(1,            *(c_v.data() + 1));
     ASSERT_EQUAL(2,            *(c_v.data() + 2));
-    ASSERT_EQUAL(&c_v.front(),  c_v.data());
-    ASSERT_EQUAL(&*c_v.begin(), c_v.data());
-    ASSERT_EQUAL(&c_v[0],       c_v.data());
+    ASSERT_EQUAL(PointerConstT(&c_v.front()),  c_v.data());
+    ASSERT_EQUAL(PointerConstT(&*c_v.begin()), c_v.data());
+    ASSERT_EQUAL(PointerConstT(&c_v[0]),       c_v.data());
 }
 DECLARE_VECTOR_UNITTEST(TestVectorData);
 
@@ -79,8 +110,6 @@ DECLARE_VECTOR_UNITTEST(TestVectorData);
 template <class Vector>
 void TestVectorElementAssignment(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(3);
 
     v[0] = 0; v[1] = 1; v[2] = 2;
@@ -117,14 +146,14 @@ void TestVectorFromSTLVector(void)
 
     thrust::host_vector<T> v(stl_vector);
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 2);
 
     v = stl_vector;
-    
-    ASSERT_EQUAL(v.size(), 3);
+
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 2);
@@ -140,7 +169,7 @@ void TestVectorFillAssign(void)
     thrust::host_vector<T> v;
     v.assign(3, 13);
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 13);
     ASSERT_EQUAL(v[1], 13);
     ASSERT_EQUAL(v[2], 13);
@@ -161,7 +190,7 @@ void TestVectorAssignFromSTLVector(void)
     thrust::host_vector<T> v;
     v.assign(stl_vector.begin(), stl_vector.end());
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 2);
@@ -173,15 +202,15 @@ template <class Vector>
 void TestVectorFromBiDirectionalIterator(void)
 {
     typedef typename Vector::value_type T;
-    
+
     std::list<T> stl_list;
     stl_list.push_back(0);
     stl_list.push_back(1);
     stl_list.push_back(2);
 
-    thrust::host_vector<int> v(stl_list.begin(), stl_list.end());
+    Vector v(stl_list.begin(), stl_list.end());
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 2);
@@ -193,7 +222,7 @@ template <class Vector>
 void TestVectorAssignFromBiDirectionalIterator(void)
 {
     typedef typename Vector::value_type T;
-    
+
     std::list<T> stl_list;
     stl_list.push_back(0);
     stl_list.push_back(1);
@@ -202,7 +231,7 @@ void TestVectorAssignFromBiDirectionalIterator(void)
     Vector v;
     v.assign(stl_list.begin(), stl_list.end());
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
     ASSERT_EQUAL(v[2], 2);
@@ -242,7 +271,7 @@ void TestVectorToAndFromHostVector(void)
 
     ASSERT_EQUAL(v, h);
 
-    v = v;
+    THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(v = v);
 
     ASSERT_EQUAL(v, h);
 
@@ -250,7 +279,7 @@ void TestVectorToAndFromHostVector(void)
     v[1] = 11;
     v[2] = 12;
 
-    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10); 
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
     ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
 
@@ -298,8 +327,8 @@ void TestVectorToAndFromDeviceVector(void)
     Vector v(h);
 
     ASSERT_EQUAL(v, h);
-    
-    v = v;
+
+    THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(v = v);
 
     ASSERT_EQUAL(v, h);
 
@@ -307,7 +336,7 @@ void TestVectorToAndFromDeviceVector(void)
     v[1] = 11;
     v[2] = 12;
 
-    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10); 
+    ASSERT_EQUAL(h[0], 0);  ASSERT_EQUAL(v[0], 10);
     ASSERT_EQUAL(h[1], 1);  ASSERT_EQUAL(v[1], 11);
     ASSERT_EQUAL(h[2], 2);  ASSERT_EQUAL(v[2], 12);
 
@@ -333,7 +362,7 @@ void TestVectorWithInitialValue(void)
 
     Vector v(3, init);
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
     ASSERT_EQUAL(v[0], init);
     ASSERT_EQUAL(v[1], init);
     ASSERT_EQUAL(v[2], init);
@@ -344,8 +373,6 @@ DECLARE_VECTOR_UNITTEST(TestVectorWithInitialValue);
 template <class Vector>
 void TestVectorSwap(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(3);
     v[0] = 0; v[1] = 1; v[2] = 2;
 
@@ -354,7 +381,7 @@ void TestVectorSwap(void)
 
     v.swap(u);
 
-    ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(u[0], 0);  
+    ASSERT_EQUAL(v[0], 10); ASSERT_EQUAL(u[0], 0);
     ASSERT_EQUAL(v[1], 11); ASSERT_EQUAL(u[1], 1);
     ASSERT_EQUAL(v[2], 12); ASSERT_EQUAL(u[2], 2);
 }
@@ -364,40 +391,38 @@ DECLARE_VECTOR_UNITTEST(TestVectorSwap);
 template <class Vector>
 void TestVectorErasePosition(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(5);
     v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4;
 
     v.erase(v.begin() + 2);
 
-    ASSERT_EQUAL(v.size(), 4); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 1); 
-    ASSERT_EQUAL(v[2], 3); 
-    ASSERT_EQUAL(v[3], 4); 
-    
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 1);
+    ASSERT_EQUAL(v[2], 3);
+    ASSERT_EQUAL(v[3], 4);
+
     v.erase(v.begin() + 0);
 
-    ASSERT_EQUAL(v.size(), 3); 
-    ASSERT_EQUAL(v[0], 1); 
-    ASSERT_EQUAL(v[1], 3); 
-    ASSERT_EQUAL(v[2], 4); 
-    
+    ASSERT_EQUAL(v.size(), 3lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+
     v.erase(v.begin() + 2);
 
-    ASSERT_EQUAL(v.size(), 2); 
-    ASSERT_EQUAL(v[0], 1); 
-    ASSERT_EQUAL(v[1], 3); 
-    
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 1);
+    ASSERT_EQUAL(v[1], 3);
+
     v.erase(v.begin() + 1);
 
-    ASSERT_EQUAL(v.size(), 1); 
-    ASSERT_EQUAL(v[0], 1); 
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 1);
 
     v.erase(v.begin() + 0);
 
-    ASSERT_EQUAL(v.size(), 0); 
+    ASSERT_EQUAL(v.size(), 0lu);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorErasePosition);
 
@@ -405,33 +430,31 @@ DECLARE_VECTOR_UNITTEST(TestVectorErasePosition);
 template <class Vector>
 void TestVectorEraseRange(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v(6);
     v[0] = 0; v[1] = 1; v[2] = 2; v[3] = 3; v[4] = 4; v[5] = 5;
 
     v.erase(v.begin() + 1, v.begin() + 3);
 
-    ASSERT_EQUAL(v.size(), 4); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 3); 
-    ASSERT_EQUAL(v[2], 4); 
-    ASSERT_EQUAL(v[3], 5); 
-    
+    ASSERT_EQUAL(v.size(), 4lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+    ASSERT_EQUAL(v[2], 4);
+    ASSERT_EQUAL(v[3], 5);
+
     v.erase(v.begin() + 2, v.end());
 
-    ASSERT_EQUAL(v.size(), 2); 
-    ASSERT_EQUAL(v[0], 0); 
-    ASSERT_EQUAL(v[1], 3); 
-    
+    ASSERT_EQUAL(v.size(), 2lu);
+    ASSERT_EQUAL(v[0], 0);
+    ASSERT_EQUAL(v[1], 3);
+
     v.erase(v.begin() + 0, v.begin() + 1);
 
-    ASSERT_EQUAL(v.size(), 1); 
-    ASSERT_EQUAL(v[0], 3); 
-    
+    ASSERT_EQUAL(v.size(), 1lu);
+    ASSERT_EQUAL(v[0], 3);
+
     v.erase(v.begin(), v.end());
 
-    ASSERT_EQUAL(v.size(), 0); 
+    ASSERT_EQUAL(v.size(), 0lu);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorEraseRange);
 
@@ -459,21 +482,21 @@ void TestVectorEquality(void)
     s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
     s_b[0] = 0;    s_b[1] = 1;
 
-    ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true);  ASSERT_EQUAL((d_a == d_a), true); 
+    ASSERT_EQUAL((h_a == h_a), true); ASSERT_EQUAL((h_a == d_a), true); ASSERT_EQUAL((d_a == h_a), true);  ASSERT_EQUAL((d_a == d_a), true);
     ASSERT_EQUAL((h_b == h_b), true); ASSERT_EQUAL((h_b == d_b), true); ASSERT_EQUAL((d_b == h_b), true);  ASSERT_EQUAL((d_b == d_b), true);
     ASSERT_EQUAL((h_c == h_c), true); ASSERT_EQUAL((h_c == d_c), true); ASSERT_EQUAL((d_c == h_c), true);  ASSERT_EQUAL((d_c == d_c), true);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a == d_a), true); ASSERT_EQUAL((d_a == s_a), true); 
+    ASSERT_EQUAL((s_a == d_a), true); ASSERT_EQUAL((d_a == s_a), true);
     ASSERT_EQUAL((s_b == d_b), true); ASSERT_EQUAL((d_b == s_b), true);
     ASSERT_EQUAL((s_c == d_c), true); ASSERT_EQUAL((d_c == s_c), true);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a == h_a), true); ASSERT_EQUAL((h_a == s_a), true); 
+    ASSERT_EQUAL((s_a == h_a), true); ASSERT_EQUAL((h_a == s_a), true);
     ASSERT_EQUAL((s_b == h_b), true); ASSERT_EQUAL((h_b == s_b), true);
     ASSERT_EQUAL((s_c == h_c), true); ASSERT_EQUAL((h_c == s_c), true);
 
-    ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false); 
+    ASSERT_EQUAL((h_a == h_b), false); ASSERT_EQUAL((h_a == d_b), false); ASSERT_EQUAL((d_a == h_b), false); ASSERT_EQUAL((d_a == d_b), false);
     ASSERT_EQUAL((h_b == h_a), false); ASSERT_EQUAL((h_b == d_a), false); ASSERT_EQUAL((d_b == h_a), false); ASSERT_EQUAL((d_b == d_a), false);
     ASSERT_EQUAL((h_a == h_c), false); ASSERT_EQUAL((h_a == d_c), false); ASSERT_EQUAL((d_a == h_c), false); ASSERT_EQUAL((d_a == d_c), false);
     ASSERT_EQUAL((h_c == h_a), false); ASSERT_EQUAL((h_c == d_a), false); ASSERT_EQUAL((d_c == h_a), false); ASSERT_EQUAL((d_c == d_a), false);
@@ -481,7 +504,7 @@ void TestVectorEquality(void)
     ASSERT_EQUAL((h_c == h_b), false); ASSERT_EQUAL((h_c == d_b), false); ASSERT_EQUAL((d_c == h_b), false); ASSERT_EQUAL((d_c == d_b), false);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a == d_b), false); ASSERT_EQUAL((d_a == s_b), false); 
+    ASSERT_EQUAL((s_a == d_b), false); ASSERT_EQUAL((d_a == s_b), false);
     ASSERT_EQUAL((s_b == d_a), false); ASSERT_EQUAL((d_b == s_a), false);
     ASSERT_EQUAL((s_a == d_c), false); ASSERT_EQUAL((d_a == s_c), false);
     ASSERT_EQUAL((s_c == d_a), false); ASSERT_EQUAL((d_c == s_a), false);
@@ -489,7 +512,7 @@ void TestVectorEquality(void)
     ASSERT_EQUAL((s_c == d_b), false); ASSERT_EQUAL((d_c == s_b), false);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a == h_b), false); ASSERT_EQUAL((h_a == s_b), false); 
+    ASSERT_EQUAL((s_a == h_b), false); ASSERT_EQUAL((h_a == s_b), false);
     ASSERT_EQUAL((s_b == h_a), false); ASSERT_EQUAL((h_b == s_a), false);
     ASSERT_EQUAL((s_a == h_c), false); ASSERT_EQUAL((h_a == s_c), false);
     ASSERT_EQUAL((s_c == h_a), false); ASSERT_EQUAL((h_c == s_a), false);
@@ -521,21 +544,21 @@ void TestVectorInequality(void)
     s_b[0] = 0;    s_b[1] = 1;    s_b[2] = 3;
     s_b[0] = 0;    s_b[1] = 1;
 
-    ASSERT_EQUAL((h_a != h_a), false); ASSERT_EQUAL((h_a != d_a), false); ASSERT_EQUAL((d_a != h_a), false);  ASSERT_EQUAL((d_a != d_a), false); 
+    ASSERT_EQUAL((h_a != h_a), false); ASSERT_EQUAL((h_a != d_a), false); ASSERT_EQUAL((d_a != h_a), false);  ASSERT_EQUAL((d_a != d_a), false);
     ASSERT_EQUAL((h_b != h_b), false); ASSERT_EQUAL((h_b != d_b), false); ASSERT_EQUAL((d_b != h_b), false);  ASSERT_EQUAL((d_b != d_b), false);
     ASSERT_EQUAL((h_c != h_c), false); ASSERT_EQUAL((h_c != d_c), false); ASSERT_EQUAL((d_c != h_c), false);  ASSERT_EQUAL((d_c != d_c), false);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a != d_a), false); ASSERT_EQUAL((d_a != s_a), false); 
+    ASSERT_EQUAL((s_a != d_a), false); ASSERT_EQUAL((d_a != s_a), false);
     ASSERT_EQUAL((s_b != d_b), false); ASSERT_EQUAL((d_b != s_b), false);
     ASSERT_EQUAL((s_c != d_c), false); ASSERT_EQUAL((d_c != s_c), false);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a != h_a), false); ASSERT_EQUAL((h_a != s_a), false); 
+    ASSERT_EQUAL((s_a != h_a), false); ASSERT_EQUAL((h_a != s_a), false);
     ASSERT_EQUAL((s_b != h_b), false); ASSERT_EQUAL((h_b != s_b), false);
     ASSERT_EQUAL((s_c != h_c), false); ASSERT_EQUAL((h_c != s_c), false);
 
-    ASSERT_EQUAL((h_a != h_b), true); ASSERT_EQUAL((h_a != d_b), true); ASSERT_EQUAL((d_a != h_b), true); ASSERT_EQUAL((d_a != d_b), true); 
+    ASSERT_EQUAL((h_a != h_b), true); ASSERT_EQUAL((h_a != d_b), true); ASSERT_EQUAL((d_a != h_b), true); ASSERT_EQUAL((d_a != d_b), true);
     ASSERT_EQUAL((h_b != h_a), true); ASSERT_EQUAL((h_b != d_a), true); ASSERT_EQUAL((d_b != h_a), true); ASSERT_EQUAL((d_b != d_a), true);
     ASSERT_EQUAL((h_a != h_c), true); ASSERT_EQUAL((h_a != d_c), true); ASSERT_EQUAL((d_a != h_c), true); ASSERT_EQUAL((d_a != d_c), true);
     ASSERT_EQUAL((h_c != h_a), true); ASSERT_EQUAL((h_c != d_a), true); ASSERT_EQUAL((d_c != h_a), true); ASSERT_EQUAL((d_c != d_a), true);
@@ -543,7 +566,7 @@ void TestVectorInequality(void)
     ASSERT_EQUAL((h_c != h_b), true); ASSERT_EQUAL((h_c != d_b), true); ASSERT_EQUAL((d_c != h_b), true); ASSERT_EQUAL((d_c != d_b), true);
 
     // test vector vs device_vector
-    ASSERT_EQUAL((s_a != d_b), true); ASSERT_EQUAL((d_a != s_b), true); 
+    ASSERT_EQUAL((s_a != d_b), true); ASSERT_EQUAL((d_a != s_b), true);
     ASSERT_EQUAL((s_b != d_a), true); ASSERT_EQUAL((d_b != s_a), true);
     ASSERT_EQUAL((s_a != d_c), true); ASSERT_EQUAL((d_a != s_c), true);
     ASSERT_EQUAL((s_c != d_a), true); ASSERT_EQUAL((d_c != s_a), true);
@@ -551,7 +574,7 @@ void TestVectorInequality(void)
     ASSERT_EQUAL((s_c != d_b), true); ASSERT_EQUAL((d_c != s_b), true);
 
     // test vector vs host_vector
-    ASSERT_EQUAL((s_a != h_b), true); ASSERT_EQUAL((h_a != s_b), true); 
+    ASSERT_EQUAL((s_a != h_b), true); ASSERT_EQUAL((h_a != s_b), true);
     ASSERT_EQUAL((s_b != h_a), true); ASSERT_EQUAL((h_b != s_a), true);
     ASSERT_EQUAL((s_a != h_c), true); ASSERT_EQUAL((h_a != s_c), true);
     ASSERT_EQUAL((s_c != h_a), true); ASSERT_EQUAL((h_c != s_a), true);
@@ -564,19 +587,17 @@ DECLARE_UNITTEST(TestVectorInequality);
 template <class Vector>
 void TestVectorResizing(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v;
 
     v.resize(3);
 
-    ASSERT_EQUAL(v.size(), 3);
+    ASSERT_EQUAL(v.size(), 3lu);
 
     v[0] = 0; v[1] = 1; v[2] = 2;
 
     v.resize(5);
 
-    ASSERT_EQUAL(v.size(), 5);
+    ASSERT_EQUAL(v.size(), 5lu);
 
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
@@ -586,7 +607,7 @@ void TestVectorResizing(void)
 
     v.resize(4);
 
-    ASSERT_EQUAL(v.size(), 4);
+    ASSERT_EQUAL(v.size(), 4lu);
 
     ASSERT_EQUAL(v[0], 0);
     ASSERT_EQUAL(v[1], 1);
@@ -595,10 +616,10 @@ void TestVectorResizing(void)
 
     v.resize(0);
 
-    ASSERT_EQUAL(v.size(), 0);
+    ASSERT_EQUAL(v.size(), 0lu);
 
-// TODO remove this WAR      
-#if defined(__CUDACC__) && CUDA_VERSION==3000
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
     // depending on sizeof(T), we will receive one
     // of two possible exceptions
     try
@@ -611,9 +632,9 @@ void TestVectorResizing(void)
       // reset the CUDA error
       cudaGetLastError();
     } // end catch
-#endif // defined(__CUDACC__) && CUDA_VERSION==3000
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
 
-    ASSERT_EQUAL(v.size(), 0);
+    ASSERT_EQUAL(v.size(), 0lu);
 }
 DECLARE_VECTOR_UNITTEST(TestVectorResizing);
 
@@ -622,13 +643,11 @@ DECLARE_VECTOR_UNITTEST(TestVectorResizing);
 template <class Vector>
 void TestVectorReserving(void)
 {
-    typedef typename Vector::value_type T;
-
     Vector v;
 
     v.reserve(3);
 
-    ASSERT_GEQUAL(v.capacity(), 3);
+    ASSERT_GEQUAL(v.capacity(), 3lu);
 
     size_t old_capacity = v.capacity();
 
@@ -636,15 +655,15 @@ void TestVectorReserving(void)
 
     ASSERT_EQUAL(v.capacity(), old_capacity);
 
-// TODO remove this WAR      
-#if defined(__CUDACC__) && CUDA_VERSION==3000
+// TODO remove this WAR
+#if defined(__CUDACC__) && CUDART_VERSION==3000
     try
     {
       v.reserve(std::numeric_limits<size_t>::max());
     }
     catch(std::length_error e) {}
     catch(std::bad_alloc e) {}
-#endif // defined(__CUDACC__) && CUDA_VERSION==3000
+#endif // defined(__CUDACC__) && CUDART_VERSION==3000
 
     ASSERT_EQUAL(v.capacity(), old_capacity);
 }
@@ -652,6 +671,19 @@ DECLARE_VECTOR_UNITTEST(TestVectorReserving)
 
 
 
+template <class Vector>
+void TestVectorUninitialisedCopy(void)
+{
+    thrust::device_vector<int> v;
+    std::vector<int> std_vector;
+
+    v = std_vector;
+
+    ASSERT_EQUAL(v.size(), static_cast<size_t>(0));
+}
+DECLARE_VECTOR_UNITTEST(TestVectorUninitialisedCopy);
+
+
 template <class Vector>
 void TestVectorShrinkToFit(void)
 {
@@ -661,7 +693,7 @@ void TestVectorShrinkToFit(void)
 
     v.reserve(200);
 
-    ASSERT_GEQUAL(v.capacity(), 200);
+    ASSERT_GEQUAL(v.capacity(), 200lu);
 
     v.push_back(1);
     v.push_back(2);
@@ -669,11 +701,11 @@ void TestVectorShrinkToFit(void)
 
     v.shrink_to_fit();
 
-    ASSERT_EQUAL(1, v[0]);
-    ASSERT_EQUAL(2, v[1]);
-    ASSERT_EQUAL(3, v[2]);
-    ASSERT_EQUAL(3, v.size());
-    ASSERT_EQUAL(3, v.capacity());
+    ASSERT_EQUAL(T(1), v[0]);
+    ASSERT_EQUAL(T(2), v[1]);
+    ASSERT_EQUAL(T(3), v[2]);
+    ASSERT_EQUAL(3lu, v.size());
+    ASSERT_EQUAL(3lu, v.capacity());
 }
 DECLARE_VECTOR_UNITTEST(TestVectorShrinkToFit)
 
@@ -694,7 +726,7 @@ struct LargeStruct
 
 void TestVectorContainingLargeType(void)
 {
-    // Thrust issue #5 
+    // Thrust issue #5
     // http://code.google.com/p/thrust/issues/detail?id=5
     const static int N = 100;
     typedef LargeStruct<N> T;
@@ -706,9 +738,9 @@ void TestVectorContainingLargeType(void)
 
     thrust::device_vector<T> dv2(20);
     thrust::host_vector<T>   hv2(20);
-    
+
     ASSERT_EQUAL_QUIET(dv2, hv2);
-    
+
     // initialize tofirst element to something nonzero
     T ls;
 
@@ -717,15 +749,15 @@ void TestVectorContainingLargeType(void)
 
     thrust::device_vector<T> dv3(20, ls);
     thrust::host_vector<T>   hv3(20, ls);
-    
+
     ASSERT_EQUAL_QUIET(dv3, hv3);
-    
+
     // change first element
     ls.data[0] = -13;
 
     dv3[2] = ls;
     hv3[2] = ls;
-    
+
     ASSERT_EQUAL_QUIET(dv3, hv3);
 }
 DECLARE_UNITTEST(TestVectorContainingLargeType);
@@ -735,7 +767,6 @@ template <typename Vector>
 void TestVectorReversed(void)
 {
   Vector v(3);
-  typedef typename Vector::value_type T;
   v[0] = 0; v[1] = 1; v[2] = 2;
 
   ASSERT_EQUAL(3, v.rend() - v.rbegin());
@@ -754,3 +785,56 @@ void TestVectorReversed(void)
 }
 DECLARE_VECTOR_UNITTEST(TestVectorReversed);
 
+#if THRUST_CPP_DIALECT >= 2011
+  template <class Vector>
+  void TestVectorMove(void)
+  {
+    //test move construction
+    Vector v1(3);
+    v1[0] = 0; v1[1] = 1; v1[2] = 2;
+
+    const auto ptr1 = v1.data();
+    const auto size1 = v1.size();
+
+    Vector v2(std::move(v1));
+    const auto ptr2 = v2.data();
+    const auto size2 = v2.size();
+
+    // ensure v1 was left empty
+    ASSERT_EQUAL(true, v1.empty());
+
+    // ensure v2 received the data from before
+    ASSERT_EQUAL(v2[0], 0);
+    ASSERT_EQUAL(v2[1], 1);
+    ASSERT_EQUAL(v2[2], 2);
+    ASSERT_EQUAL(size1, size2);
+
+    // ensure v2 received the pointer from before
+    ASSERT_EQUAL(ptr1, ptr2);
+
+    //test move assignment
+    Vector v3(3);
+    v3[0] = 3; v3[1] = 4; v3[2] = 5;
+
+    const auto ptr3 = v3.data();
+    const auto size3 = v3.size();
+
+    v2 = std::move(v3);
+    const auto ptr4 = v2.data();
+    const auto size4 = v2.size();
+
+    // ensure v3 was left empty
+    ASSERT_EQUAL(true, v3.empty());
+
+    // ensure v2 received the data from before
+    ASSERT_EQUAL(v2[0], 3);
+    ASSERT_EQUAL(v2[1], 4);
+    ASSERT_EQUAL(v2[2], 5);
+    ASSERT_EQUAL(size3, size4);
+
+    // ensure v2 received the pointer from before
+    ASSERT_EQUAL(ptr3, ptr4);
+  }
+  DECLARE_VECTOR_UNITTEST(TestVectorMove);
+#endif
+
diff --git a/testing/vector_allocators.cu b/testing/vector_allocators.cu
new file mode 100644
index 000000000..568ea7ff6
--- /dev/null
+++ b/testing/vector_allocators.cu
@@ -0,0 +1,276 @@
+#include <unittest/unittest.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+class stateful_allocator : public BaseAlloc
+{
+  typedef thrust::detail::allocator_traits<BaseAlloc> base_traits;
+
+public:
+    stateful_allocator(int i) : state(i)
+    {
+    }
+
+    ~stateful_allocator() {}
+
+    stateful_allocator(const stateful_allocator &other)
+        : BaseAlloc(other), state(other.state)
+    {
+    }
+
+    stateful_allocator & operator=(const stateful_allocator & other)
+    {
+        state = other.state;
+        return *this;
+    }
+
+#if THRUST_CPP_DIALECT >= 2011
+    stateful_allocator(stateful_allocator && other)
+        : BaseAlloc(std::move(other)), state(other.state)
+    {
+        other.state = 0;
+    }
+
+    stateful_allocator & operator=(stateful_allocator && other)
+    {
+        state = other.state;
+        other.state = 0;
+        return *this;
+    }
+#endif
+
+    static int last_allocated;
+    static int last_deallocated;
+
+    typedef typename base_traits::pointer pointer;
+    typedef typename base_traits::const_pointer const_pointer;
+    typedef typename base_traits::reference reference;
+    typedef typename base_traits::const_reference const_reference;
+
+    pointer allocate(std::size_t size)
+    {
+        BaseAlloc alloc;
+        last_allocated = state;
+        return base_traits::allocate(alloc, size);
+    }
+
+    void deallocate(pointer ptr, std::size_t size)
+    {
+        BaseAlloc alloc;
+        last_deallocated = state;
+        return base_traits::deallocate(alloc, ptr, size);
+    }
+
+    static void construct(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::construct(alloc, ptr);
+    }
+
+    static void destroy(pointer ptr)
+    {
+      BaseAlloc alloc;
+      return base_traits::destroy(alloc, ptr);
+    }
+
+    bool operator==(const stateful_allocator &rhs) const
+    {
+        return state == rhs.state;
+    }
+
+    bool operator!=(const stateful_allocator &rhs) const
+    {
+        return state != rhs.state;
+    }
+
+    friend std::ostream & operator<<(std::ostream &os,
+        const stateful_allocator & alloc)
+    {
+        os << "stateful_alloc(" << alloc.state << ")";
+        return os;
+    }
+
+    typedef thrust::detail::false_type is_always_equal;
+    typedef thrust::detail::true_type propagate_on_container_copy_assignment;
+    typedef thrust::detail::true_type propagate_on_container_move_assignment;
+    typedef thrust::detail::integral_constant<bool, PropagateOnSwap> propagate_on_container_swap;
+
+private:
+    int state;
+};
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+int stateful_allocator<BaseAlloc, PropagateOnSwap>::last_allocated = 0;
+
+template<typename BaseAlloc, bool PropagateOnSwap>
+int stateful_allocator<BaseAlloc, PropagateOnSwap>::last_deallocated = 0;
+
+typedef stateful_allocator<std::allocator<int>, true> host_alloc;
+typedef stateful_allocator<thrust::device_allocator<int>, true> device_alloc;
+
+typedef thrust::host_vector<int, host_alloc> host_vector;
+typedef thrust::device_vector<int, device_alloc> device_vector;
+
+typedef stateful_allocator<std::allocator<int>, false> host_alloc_nsp;
+typedef stateful_allocator<thrust::device_allocator<int>, false> device_alloc_nsp;
+
+typedef thrust::host_vector<int, host_alloc_nsp> host_vector_nsp;
+typedef thrust::device_vector<int, device_alloc_nsp> device_vector_nsp;
+
+template<typename Vector>
+void TestVectorAllocatorConstructors()
+{
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(alloc1);
+    ASSERT_EQUAL(v1.get_allocator(), alloc1);
+
+    Vector v2(10, alloc1);
+    ASSERT_EQUAL(v2.size(), 10u);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    Alloc::last_allocated = 0;
+
+    Vector v3(10, 17, alloc1);
+    ASSERT_EQUAL((v3 == std::vector<int>(10, 17)), true);
+    ASSERT_EQUAL(v3.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    Alloc::last_allocated = 0;
+
+    Vector v4(v3, alloc2);
+    ASSERT_EQUAL((v3 == v4), true);
+    ASSERT_EQUAL(v4.get_allocator(), alloc2);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+    Alloc::last_allocated = 0;
+
+#if THRUST_CPP_DIALECT >= 2011
+    // FIXME: uncomment this after the vector_base(vector_base&&, const Alloc&)
+    // is fixed and implemented
+    // Vector v5(std::move(v3), alloc2);
+    // ASSERT_EQUAL((v4 == v5), true);
+    // ASSERT_EQUAL(v5.get_allocator(), alloc2);
+    // ASSERT_EQUAL(Alloc::last_allocated, 1);
+    // Alloc::last_allocated = 0;
+#endif
+
+    Vector v6(v4.begin(), v4.end(), alloc2);
+    ASSERT_EQUAL((v4 == v6), true);
+    ASSERT_EQUAL(v6.get_allocator(), alloc2);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+}
+
+void TestVectorAllocatorConstructorsHost()
+{
+    TestVectorAllocatorConstructors<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorConstructorsHost);
+
+void TestVectorAllocatorConstructorsDevice()
+{
+    TestVectorAllocatorConstructors<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorConstructorsDevice);
+
+template<typename Vector>
+void TestVectorAllocatorPropagateOnCopyAssignment()
+{
+    ASSERT_EQUAL(thrust::detail::allocator_traits<typename Vector::allocator_type>::propagate_on_container_copy_assignment::value, true);
+
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(10, alloc1);
+    Vector v2(15, alloc2);
+
+    v2 = v1;
+    ASSERT_EQUAL((v1 == v2), true);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 1);
+    ASSERT_EQUAL(Alloc::last_deallocated, 2);
+}
+
+void TestVectorAllocatorPropagateOnCopyAssignmentHost()
+{
+    TestVectorAllocatorPropagateOnCopyAssignment<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnCopyAssignmentHost);
+
+void TestVectorAllocatorPropagateOnCopyAssignmentDevice()
+{
+    TestVectorAllocatorPropagateOnCopyAssignment<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnCopyAssignmentDevice);
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename Vector>
+void TestVectorAllocatorPropagateOnMoveAssignment()
+{
+    typedef typename Vector::allocator_type Alloc;
+    ASSERT_EQUAL(thrust::detail::allocator_traits<typename Vector::allocator_type>::propagate_on_container_copy_assignment::value, true);
+
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    {
+    Vector v1(10, alloc1);
+    Vector v2(15, alloc2);
+
+    v2 = std::move(v1);
+    ASSERT_EQUAL(v2.get_allocator(), alloc1);
+    ASSERT_EQUAL(Alloc::last_allocated, 2);
+    ASSERT_EQUAL(Alloc::last_deallocated, 2);
+    }
+
+    ASSERT_EQUAL(Alloc::last_deallocated, 1);
+}
+
+void TestVectorAllocatorPropagateOnMoveAssignmentHost()
+{
+    TestVectorAllocatorPropagateOnMoveAssignment<host_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnMoveAssignmentHost);
+
+void TestVectorAllocatorPropagateOnMoveAssignmentDevice()
+{
+    TestVectorAllocatorPropagateOnMoveAssignment<device_vector>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnMoveAssignmentDevice);
+#endif
+
+template<typename Vector>
+void TestVectorAllocatorPropagateOnSwap()
+{
+    typedef typename Vector::allocator_type Alloc;
+    Alloc alloc1(1);
+    Alloc alloc2(2);
+
+    Vector v1(10, alloc1);
+    Vector v2(17, alloc1);
+    thrust::swap(v1, v2);
+
+    ASSERT_EQUAL(v1.size(), 17u);
+    ASSERT_EQUAL(v2.size(), 10u);
+
+    Vector v3(15, alloc1);
+    Vector v4(31, alloc2);
+    ASSERT_THROWS(thrust::swap(v3, v4), thrust::detail::allocator_mismatch_on_swap);
+}
+
+void TestVectorAllocatorPropagateOnSwapHost()
+{
+    TestVectorAllocatorPropagateOnSwap<host_vector_nsp>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnSwapHost);
+
+void TestVectorAllocatorPropagateOnSwapDevice()
+{
+    TestVectorAllocatorPropagateOnSwap<device_vector_nsp>();
+}
+DECLARE_UNITTEST(TestVectorAllocatorPropagateOnSwapDevice);
diff --git a/testing/vector_cpp_subset.cpp b/testing/vector_cpp_subset.cpp
deleted file mode 100644
index 5618b36b3..000000000
--- a/testing/vector_cpp_subset.cpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#include <unittest/unittest.h>
-
-template <class Vector>
-void TestVectorCppZeroSize(void)
-{
-    Vector v;
-    ASSERT_EQUAL(v.size(), 0);
-    ASSERT_EQUAL((v.begin() == v.end()), true);
-}
-DECLARE_VECTOR_UNITTEST(TestVectorCppZeroSize);
-
-
diff --git a/testing/vector_insert.cu b/testing/vector_insert.cu
index c32b1d060..a9f674aa0 100644
--- a/testing/vector_insert.cu
+++ b/testing/vector_insert.cu
@@ -5,8 +5,10 @@
 template <class Vector>
 struct TestVectorRangeInsertSimple
 {
-    void operator()(size_t dummy)
+    void operator()(size_t)
     {
+        typedef typename Vector::value_type T;
+
         Vector v1(5);
         thrust::sequence(v1.begin(), v1.end());
 
@@ -27,19 +29,19 @@ struct TestVectorRangeInsertSimple
         v2.insert(v2.begin() + 1,
                   v1.begin(), v1.end());
 
-        ASSERT_EQUAL(0, v2[0]);
+        ASSERT_EQUAL(T(0), v2[0]);
 
-        ASSERT_EQUAL(0, v2[1]);
-        ASSERT_EQUAL(1, v2[2]);
-        ASSERT_EQUAL(2, v2[3]);
-        ASSERT_EQUAL(3, v2[4]);
-        ASSERT_EQUAL(4, v2[5]);
+        ASSERT_EQUAL(T(0), v2[1]);
+        ASSERT_EQUAL(T(1), v2[2]);
+        ASSERT_EQUAL(T(2), v2[3]);
+        ASSERT_EQUAL(T(3), v2[4]);
+        ASSERT_EQUAL(T(4), v2[5]);
 
-        ASSERT_EQUAL(1, v2[6]);
-        ASSERT_EQUAL(2, v2[7]);
+        ASSERT_EQUAL(T(1), v2[6]);
+        ASSERT_EQUAL(T(2), v2[7]);
         
-        ASSERT_EQUAL(8,  v2.size());
-        ASSERT_EQUAL(10, v2.capacity());
+        ASSERT_EQUAL(8lu,  v2.size());
+        ASSERT_EQUAL(10lu, v2.capacity());
 
         // test when insertion range fits inside capacity
         // and the size of the insertion is equal to the number
@@ -58,20 +60,20 @@ struct TestVectorRangeInsertSimple
         v3.insert(v3.begin(),
                   v1.begin(), v1.end());
 
-        ASSERT_EQUAL(0, v3[0]);
-        ASSERT_EQUAL(1, v3[1]);
-        ASSERT_EQUAL(2, v3[2]);
-        ASSERT_EQUAL(3, v3[3]);
-        ASSERT_EQUAL(4, v3[4]);
+        ASSERT_EQUAL(T(0), v3[0]);
+        ASSERT_EQUAL(T(1), v3[1]);
+        ASSERT_EQUAL(T(2), v3[2]);
+        ASSERT_EQUAL(T(3), v3[3]);
+        ASSERT_EQUAL(T(4), v3[4]);
 
-        ASSERT_EQUAL(0, v3[5]);
-        ASSERT_EQUAL(1, v3[6]);
-        ASSERT_EQUAL(2, v3[7]);
-        ASSERT_EQUAL(3, v3[8]);
-        ASSERT_EQUAL(4, v3[9]);
+        ASSERT_EQUAL(T(0), v3[5]);
+        ASSERT_EQUAL(T(1), v3[6]);
+        ASSERT_EQUAL(T(2), v3[7]);
+        ASSERT_EQUAL(T(3), v3[8]);
+        ASSERT_EQUAL(T(4), v3[9]);
 
-        ASSERT_EQUAL(10, v3.size());
-        ASSERT_EQUAL(10, v3.capacity());
+        ASSERT_EQUAL(10lu, v3.size());
+        ASSERT_EQUAL(10lu, v3.capacity());
 
         // test when insertion range fits inside capacity
         // and the size of the insertion is less than the
@@ -90,19 +92,19 @@ struct TestVectorRangeInsertSimple
         v4.insert(v4.begin() + 1,
                   v1.begin(), v1.begin() + 3);
 
-        ASSERT_EQUAL(0, v4[0]);
+        ASSERT_EQUAL(T(0), v4[0]);
 
-        ASSERT_EQUAL(0, v4[1]);
-        ASSERT_EQUAL(1, v4[2]);
-        ASSERT_EQUAL(2, v4[3]);
+        ASSERT_EQUAL(T(0), v4[1]);
+        ASSERT_EQUAL(T(1), v4[2]);
+        ASSERT_EQUAL(T(2), v4[3]);
 
-        ASSERT_EQUAL(1, v4[4]);
-        ASSERT_EQUAL(2, v4[5]);
-        ASSERT_EQUAL(3, v4[6]);
-        ASSERT_EQUAL(4, v4[7]);
+        ASSERT_EQUAL(T(1), v4[4]);
+        ASSERT_EQUAL(T(2), v4[5]);
+        ASSERT_EQUAL(T(3), v4[6]);
+        ASSERT_EQUAL(T(4), v4[7]);
 
-        ASSERT_EQUAL(8, v4.size());
-        ASSERT_EQUAL(10, v4.capacity());
+        ASSERT_EQUAL(8lu, v4.size());
+        ASSERT_EQUAL(10lu, v4.capacity());
 
         // test when insertion range does not fit inside capacity
         Vector v5(5);
@@ -115,20 +117,20 @@ struct TestVectorRangeInsertSimple
         v5.insert(v5.begin() + 1,
                   v1.begin(), v1.end());
 
-        ASSERT_EQUAL(0, v5[0]);
+        ASSERT_EQUAL(T(0), v5[0]);
 
-        ASSERT_EQUAL(0, v5[1]);
-        ASSERT_EQUAL(1, v5[2]);
-        ASSERT_EQUAL(2, v5[3]);
-        ASSERT_EQUAL(3, v5[4]);
-        ASSERT_EQUAL(4, v5[5]);
+        ASSERT_EQUAL(T(0), v5[1]);
+        ASSERT_EQUAL(T(1), v5[2]);
+        ASSERT_EQUAL(T(2), v5[3]);
+        ASSERT_EQUAL(T(3), v5[4]);
+        ASSERT_EQUAL(T(4), v5[5]);
 
-        ASSERT_EQUAL(1, v5[6]);
-        ASSERT_EQUAL(2, v5[7]);
-        ASSERT_EQUAL(3, v5[8]);
-        ASSERT_EQUAL(4, v5[9]);
+        ASSERT_EQUAL(T(1), v5[6]);
+        ASSERT_EQUAL(T(2), v5[7]);
+        ASSERT_EQUAL(T(3), v5[8]);
+        ASSERT_EQUAL(T(4), v5[9]);
 
-        ASSERT_EQUAL(10, v5.size());
+        ASSERT_EQUAL(10lu, v5.size());
     }
 }; // end TestVectorRangeInsertSimple
 VectorUnitTest<TestVectorRangeInsertSimple, NumericTypes, thrust::device_vector, thrust::device_malloc_allocator> TestVectorRangeInsertSimpleDeviceInstance;
@@ -171,8 +173,10 @@ VariableUnitTest<TestVectorRangeInsert, IntegralTypes> TestVectorRangeInsertInst
 template <class Vector>
 struct TestVectorFillInsertSimple
 {
-    void operator()(size_t dummy)
+    void operator()(size_t)
     {
+        typedef typename Vector::value_type T;
+
         // test when insertion range fits inside capacity
         // and the size of the insertion is greater than the number
         // of displaced elements
@@ -189,19 +193,19 @@ struct TestVectorFillInsertSimple
 
         v1.insert(v1.begin() + 1, insertion_size, 13);
 
-        ASSERT_EQUAL(0, v1[0]);
+        ASSERT_EQUAL(T(0), v1[0]);
 
-        ASSERT_EQUAL(13, v1[1]);
-        ASSERT_EQUAL(13, v1[2]);
-        ASSERT_EQUAL(13, v1[3]);
-        ASSERT_EQUAL(13, v1[4]);
-        ASSERT_EQUAL(13, v1[5]);
+        ASSERT_EQUAL(T(13), v1[1]);
+        ASSERT_EQUAL(T(13), v1[2]);
+        ASSERT_EQUAL(T(13), v1[3]);
+        ASSERT_EQUAL(T(13), v1[4]);
+        ASSERT_EQUAL(T(13), v1[5]);
 
-        ASSERT_EQUAL(1, v1[6]);
-        ASSERT_EQUAL(2, v1[7]);
+        ASSERT_EQUAL(T(1), v1[6]);
+        ASSERT_EQUAL(T(2), v1[7]);
         
-        ASSERT_EQUAL(8,  v1.size());
-        ASSERT_EQUAL(10, v1.capacity());
+        ASSERT_EQUAL(8lu,  v1.size());
+        ASSERT_EQUAL(10lu, v1.capacity());
 
         // test when insertion range fits inside capacity
         // and the size of the insertion is equal to the number
@@ -219,20 +223,20 @@ struct TestVectorFillInsertSimple
 
         v2.insert(v2.begin(), insertion_size, 13);
 
-        ASSERT_EQUAL(13, v2[0]);
-        ASSERT_EQUAL(13, v2[1]);
-        ASSERT_EQUAL(13, v2[2]);
-        ASSERT_EQUAL(13, v2[3]);
-        ASSERT_EQUAL(13, v2[4]);
+        ASSERT_EQUAL(T(13), v2[0]);
+        ASSERT_EQUAL(T(13), v2[1]);
+        ASSERT_EQUAL(T(13), v2[2]);
+        ASSERT_EQUAL(T(13), v2[3]);
+        ASSERT_EQUAL(T(13), v2[4]);
 
-        ASSERT_EQUAL(0, v2[5]);
-        ASSERT_EQUAL(1, v2[6]);
-        ASSERT_EQUAL(2, v2[7]);
-        ASSERT_EQUAL(3, v2[8]);
-        ASSERT_EQUAL(4, v2[9]);
+        ASSERT_EQUAL(T(0), v2[5]);
+        ASSERT_EQUAL(T(1), v2[6]);
+        ASSERT_EQUAL(T(2), v2[7]);
+        ASSERT_EQUAL(T(3), v2[8]);
+        ASSERT_EQUAL(T(4), v2[9]);
 
-        ASSERT_EQUAL(10, v2.size());
-        ASSERT_EQUAL(10, v2.capacity());
+        ASSERT_EQUAL(10lu, v2.size());
+        ASSERT_EQUAL(10lu, v2.capacity());
 
         // test when insertion range fits inside capacity
         // and the size of the insertion is less than the
@@ -250,19 +254,19 @@ struct TestVectorFillInsertSimple
 
         v3.insert(v3.begin() + 1, insertion_size, 13);
 
-        ASSERT_EQUAL(0, v3[0]);
+        ASSERT_EQUAL(T(0), v3[0]);
 
-        ASSERT_EQUAL(13, v3[1]);
-        ASSERT_EQUAL(13, v3[2]);
-        ASSERT_EQUAL(13, v3[3]);
+        ASSERT_EQUAL(T(13), v3[1]);
+        ASSERT_EQUAL(T(13), v3[2]);
+        ASSERT_EQUAL(T(13), v3[3]);
 
-        ASSERT_EQUAL(1, v3[4]);
-        ASSERT_EQUAL(2, v3[5]);
-        ASSERT_EQUAL(3, v3[6]);
-        ASSERT_EQUAL(4, v3[7]);
+        ASSERT_EQUAL(T(1), v3[4]);
+        ASSERT_EQUAL(T(2), v3[5]);
+        ASSERT_EQUAL(T(3), v3[6]);
+        ASSERT_EQUAL(T(4), v3[7]);
 
-        ASSERT_EQUAL(8, v3.size());
-        ASSERT_EQUAL(10, v3.capacity());
+        ASSERT_EQUAL(8lu, v3.size());
+        ASSERT_EQUAL(10lu, v3.capacity());
 
         // test when insertion range does not fit inside capacity
         Vector v4(5);
@@ -275,20 +279,20 @@ struct TestVectorFillInsertSimple
 
         v4.insert(v4.begin() + 1, insertion_size, 13);
 
-        ASSERT_EQUAL(0, v4[0]);
+        ASSERT_EQUAL(T(0), v4[0]);
 
-        ASSERT_EQUAL(13, v4[1]);
-        ASSERT_EQUAL(13, v4[2]);
-        ASSERT_EQUAL(13, v4[3]);
-        ASSERT_EQUAL(13, v4[4]);
-        ASSERT_EQUAL(13, v4[5]);
+        ASSERT_EQUAL(T(13), v4[1]);
+        ASSERT_EQUAL(T(13), v4[2]);
+        ASSERT_EQUAL(T(13), v4[3]);
+        ASSERT_EQUAL(T(13), v4[4]);
+        ASSERT_EQUAL(T(13), v4[5]);
 
-        ASSERT_EQUAL(1, v4[6]);
-        ASSERT_EQUAL(2, v4[7]);
-        ASSERT_EQUAL(3, v4[8]);
-        ASSERT_EQUAL(4, v4[9]);
+        ASSERT_EQUAL(T(1), v4[6]);
+        ASSERT_EQUAL(T(2), v4[7]);
+        ASSERT_EQUAL(T(3), v4[8]);
+        ASSERT_EQUAL(T(4), v4[9]);
 
-        ASSERT_EQUAL(10, v4.size());
+        ASSERT_EQUAL(10lu, v4.size());
     }
 }; // end TestVectorFillInsertSimple
 VectorUnitTest<TestVectorFillInsertSimple, NumericTypes, thrust::device_vector, thrust::device_malloc_allocator> TestVectorFillInsertSimpleDeviceInstance;
diff --git a/testing/vector_manipulation.cu b/testing/vector_manipulation.cu
index 440e9695e..a949b154e 100644
--- a/testing/vector_manipulation.cu
+++ b/testing/vector_manipulation.cu
@@ -13,10 +13,10 @@ void TestVectorManipulation(size_t n)
 
     // basic initialization
     Vector test0(n);
-    Vector test1(n, (T) 3);
+    Vector test1(n, T(3));
     ASSERT_EQUAL(test0.size(), n);
     ASSERT_EQUAL(test1.size(), n);
-    ASSERT_EQUAL((test1 == std::vector<T>(n, (T) 3)), true);
+    ASSERT_EQUAL((test1 == std::vector<T>(n, T(3))), true);
 
 #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
     // XXX MSVC 2005's STL unintentionally uses adl to dispatch advance which
@@ -41,9 +41,9 @@ void TestVectorManipulation(size_t n)
     ASSERT_EQUAL(vec1.size(), n);
     ASSERT_EQUAL(vec1, src); 
     
-    vec1.resize(n + 20, (T) 11);
+    vec1.resize(n + 20, T(11));
     Vector tail(vec1.begin() + n, vec1.end());
-    ASSERT_EQUAL( (tail == std::vector<T>(20, (T) 11)), true);
+    ASSERT_EQUAL((tail == std::vector<T>(20, T(11))), true);
 
     // shrinking a vector should not invalidate iterators
     Iterator first = vec1.begin();
@@ -51,36 +51,36 @@ void TestVectorManipulation(size_t n)
     ASSERT_EQUAL_QUIET(first, vec1.begin());
 
     vec1.resize(0);
-    ASSERT_EQUAL(vec1.size(), 0);
+    ASSERT_EQUAL(vec1.size(), 0lu);
     ASSERT_EQUAL(vec1.empty(), true);
     vec1.resize(10);
-    ASSERT_EQUAL(vec1.size(), 10);
+    ASSERT_EQUAL(vec1.size(), 10lu);
     vec1.clear();
-    ASSERT_EQUAL(vec1.size(), 0);
+    ASSERT_EQUAL(vec1.size(), 0lu);
     vec1.resize(5);
-    ASSERT_EQUAL(vec1.size(), 5);
+    ASSERT_EQUAL(vec1.size(), 5lu);
 
     // push_back
     Vector vec2;
     for(size_t i = 0; i < 10; ++i)
     {
         ASSERT_EQUAL(vec2.size(), i);
-        vec2.push_back( (T) i );
+        vec2.push_back(T(i));
         ASSERT_EQUAL(vec2.size(), i + 1);
         for(size_t j = 0; j <= i; j++)
-            ASSERT_EQUAL(vec2[j],     j);
-        ASSERT_EQUAL(vec2.back(), i);
+            ASSERT_EQUAL(vec2[j], T(j));
+        ASSERT_EQUAL(vec2.back(), T(i));
     }
 
     // pop_back
     for(size_t i = 10; i > 0; --i)
     {
         ASSERT_EQUAL(vec2.size(), i);
-        ASSERT_EQUAL(vec2.back(), i-1);
+        ASSERT_EQUAL(vec2.back(), T(i - 1));
         vec2.pop_back();
-        ASSERT_EQUAL(vec2.size(), i-1);
+        ASSERT_EQUAL(vec2.size(), i - 1);
         for(size_t j = 0; j < i; j++)
-            ASSERT_EQUAL(vec2[j], j);
+            ASSERT_EQUAL(vec2[j], T(j));
     }
 
     //TODO test swap, erase(pos), erase(begin, end)
diff --git a/testing/zip_function.cu b/testing/zip_function.cu
new file mode 100644
index 000000000..a1545a1a1
--- /dev/null
+++ b/testing/zip_function.cu
@@ -0,0 +1,70 @@
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <unittest/unittest.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/transform.h>
+#include <thrust/zip_function.h>
+
+#include <iostream>
+
+using namespace unittest;
+
+struct SumThree
+{
+  template <typename T1, typename T2, typename T3>
+  __host__ __device__
+  auto operator()(T1 x, T2 y, T3 z) const
+  THRUST_DECLTYPE_RETURNS(x + y + z)
+}; // end SumThree
+
+struct SumThreeTuple
+{
+  template <typename Tuple>
+  __host__ __device__
+  auto operator()(Tuple x) const
+  THRUST_DECLTYPE_RETURNS(thrust::get<0>(x) + thrust::get<1>(x) + thrust::get<2>(x))
+}; // end SumThreeTuple
+
+template <typename T>
+struct TestZipFunctionTransform
+{
+  void operator()(const size_t n)
+  {
+    using namespace thrust;
+
+    host_vector<T> h_data0 = unittest::random_samples<T>(n);
+    host_vector<T> h_data1 = unittest::random_samples<T>(n);
+    host_vector<T> h_data2 = unittest::random_samples<T>(n);
+
+    device_vector<T> d_data0 = h_data0;
+    device_vector<T> d_data1 = h_data1;
+    device_vector<T> d_data2 = h_data2;
+
+    host_vector<T>   h_result_tuple(n);
+    host_vector<T>   h_result_zip(n);
+    device_vector<T> d_result_zip(n);
+
+    // Tuple base case
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_tuple.begin(),
+              SumThreeTuple{});
+    // Zip Function
+    transform(make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin(), h_data2.begin())),
+              make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end(),   h_data2.end())),
+              h_result_zip.begin(),
+              make_zip_function(SumThree{}));
+    transform(make_zip_iterator(make_tuple(d_data0.begin(), d_data1.begin(), d_data2.begin())),
+              make_zip_iterator(make_tuple(d_data0.end(),   d_data1.end(),   d_data2.end())),
+              d_result_zip.begin(),
+              make_zip_function(SumThree{}));
+
+    ASSERT_EQUAL(h_result_tuple, h_result_zip);
+    ASSERT_EQUAL(h_result_tuple, d_result_zip);
+  }
+};
+VariableUnitTest<TestZipFunctionTransform, ThirtyTwoBitTypes> TestZipFunctionTransformInstance;
+
+#endif // THRUST_CPP_DIALECT
diff --git a/testing/zip_iterator.cu b/testing/zip_iterator.cu
index c537c00e8..c48ca2170 100644
--- a/testing/zip_iterator.cu
+++ b/testing/zip_iterator.cu
@@ -148,6 +148,7 @@ template <typename T>
   {
     using namespace thrust;
 
+#if 0
     // test host types
     typedef typename host_vector<T>::iterator          Iterator1;
     typedef typename host_vector<T>::const_iterator    Iterator2;
@@ -155,10 +156,12 @@ template <typename T>
     typedef zip_iterator<IteratorTuple1> ZipIterator1;
 
     typedef typename iterator_traversal<ZipIterator1>::type zip_iterator_traversal_type1;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_traversal_type1, random_access_traversal_tag>::value) );
 
 
+#if 0
     // test device types
     typedef typename device_vector<T>::iterator        Iterator3;
     typedef typename device_vector<T>::const_iterator  Iterator4;
@@ -166,6 +169,7 @@ template <typename T>
     typedef zip_iterator<IteratorTuple2> ZipIterator2;
 
     typedef typename iterator_traversal<ZipIterator2>::type zip_iterator_traversal_type2;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_traversal_type2, thrust::random_access_traversal_tag>::value) );
   } // end operator()()
@@ -182,6 +186,7 @@ template <typename T>
 
     // XXX these assertions complain about undefined references to integral_constant<...>::value
 
+#if 0
     // test host types
     typedef typename host_vector<T>::iterator          Iterator1;
     typedef typename host_vector<T>::const_iterator    Iterator2;
@@ -189,10 +194,12 @@ template <typename T>
     typedef zip_iterator<IteratorTuple1> ZipIterator1;
 
     typedef typename iterator_system<ZipIterator1>::type zip_iterator_system_type1;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_same<zip_iterator_system_type1, experimental::space::host>::value) );
 
 
+#if 0
     // test device types
     typedef typename device_vector<T>::iterator        Iterator3;
     typedef typename device_vector<T>::const_iterator  Iterator4;
@@ -200,10 +207,12 @@ template <typename T>
     typedef zip_iterator<IteratorTuple1> ZipIterator2;
 
     typedef typename iterator_system<ZipIterator2>::type zip_iterator_system_type2;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type2, experimental::space::device>::value) );
 
 
+#if 0
     // test any
     typedef counting_iterator<T>         Iterator5;
     typedef counting_iterator<const T>   Iterator6;
@@ -211,42 +220,51 @@ template <typename T>
     typedef zip_iterator<IteratorTuple3> ZipIterator3;
 
     typedef typename iterator_system<ZipIterator3>::type zip_iterator_system_type3;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type3, thrust::experimental::space::any>::value) );
 
     
+#if 0
     // test host/any
     typedef tuple<Iterator1, Iterator5>                IteratorTuple4;
     typedef zip_iterator<IteratorTuple4> ZipIterator4;
 
     typedef typename iterator_system<ZipIterator4>::type zip_iterator_system_type4;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type4, thrust::host_system_tag>::value) );
 
 
+#if 0
     // test any/host
     typedef tuple<Iterator5, Iterator1>                IteratorTuple5;
     typedef zip_iterator<IteratorTuple5> ZipIterator5;
 
     typedef typename iterator_system<ZipIterator5>::type zip_iterator_system_type5;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type5, thrust::host_system_tag>::value) );
 
 
+#if 0
     // test device/any
     typedef tuple<Iterator3, Iterator5>                IteratorTuple6;
     typedef zip_iterator<IteratorTuple6> ZipIterator6;
 
     typedef typename iterator_system<ZipIterator6>::type zip_iterator_system_type6;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type6, thrust::device_system_tag>::value) );
 
 
+#if 0
     // test any/device
     typedef tuple<Iterator5, Iterator3>                IteratorTuple7;
     typedef zip_iterator<IteratorTuple7> ZipIterator7;
 
     typedef typename iterator_system<ZipIterator7>::type zip_iterator_system_type7;
+#endif
 
     //ASSERT_EQUAL(true, (detail::is_convertible<zip_iterator_system_type7, thrust::device_system_tag>::value) );
   } // end operator()()
@@ -258,13 +276,14 @@ template <typename Vector>
 void TestZipIteratorCopy(void)
 {
   using namespace thrust;
+  using T = typename Vector::value_type;
 
   Vector input0(4),  input1(4);
   Vector output0(4), output1(4);
 
   // initialize input
-  sequence(input0.begin(), input0.end(),  0);
-  sequence(input1.begin(), input1.end(), 13);
+  sequence(input0.begin(), input0.end(), T{0});
+  sequence(input1.begin(), input1.end(), T{13});
 
   copy( make_zip_iterator(make_tuple(input0.begin(),  input1.begin())),
         make_zip_iterator(make_tuple(input0.end(),    input1.end())),
diff --git a/testing/zip_iterator_reduce_by_key.cu b/testing/zip_iterator_reduce_by_key.cu
index d6f931a3c..e3fc99d66 100644
--- a/testing/zip_iterator_reduce_by_key.cu
+++ b/testing/zip_iterator_reduce_by_key.cu
@@ -3,7 +3,7 @@
 #include <thrust/reduce.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include <backend/cuda/testframework.h>
+#include <unittest/cuda/testframework.h>
 #endif
 
 using namespace unittest;
diff --git a/testing/zip_iterator_scan.cu b/testing/zip_iterator_scan.cu
index f7bd5862d..96ace6d76 100644
--- a/testing/zip_iterator_scan.cu
+++ b/testing/zip_iterator_scan.cu
@@ -3,7 +3,7 @@
 #include <thrust/scan.h>
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-#include <backend/cuda/testframework.h>
+#include <unittest/cuda/testframework.h>
 #endif
 
 using namespace unittest;
@@ -40,18 +40,6 @@ struct TestZipIteratorScan
     host_vector<Tuple>   h_result(n);
     device_vector<Tuple> d_result(n);
 
-    // The tests below get miscompiled on Tesla hw for 8b types
-
-#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
-    if(const CUDATestDriver *driver = dynamic_cast<const CUDATestDriver*>(&UnitTestDriver::s_driver()))
-    {
-      if(sizeof(T) == sizeof(unittest::uint8_t) && driver->current_device_architecture() < 200)
-      {
-        KNOWN_FAILURE;
-      } // end if
-    } // end if
-#endif
-
     // inclusive_scan (tuple output)
     inclusive_scan( make_zip_iterator(make_tuple(h_data0.begin(), h_data1.begin())),
                     make_zip_iterator(make_tuple(h_data0.end(),   h_data1.end())),
diff --git a/thrust/addressof.h b/thrust/addressof.h
new file mode 100644
index 000000000..d21df0c76
--- /dev/null
+++ b/thrust/addressof.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  include <thrust/detail/memory_wrapper.h>
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! Obtains the actual address of the object or function arg, even in presence of overloaded operator&.
+ */
+template <typename T>
+__host__ __device__
+T* addressof(T& arg) 
+{
+  return reinterpret_cast<T*>(
+    &const_cast<char&>(reinterpret_cast<const volatile char&>(arg))
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+THRUST_NAMESPACE_END
diff --git a/thrust/adjacent_difference.h b/thrust/adjacent_difference.h
index 3e3d9b7c7..e8385c240 100644
--- a/thrust/adjacent_difference.h
+++ b/thrust/adjacent_difference.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations Transformations
  *  \{
@@ -51,11 +49,11 @@ namespace thrust
  *  \return The iterator <tt>result + (last - first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
  *          useful for computing differences "in place".
@@ -77,7 +75,7 @@ namespace thrust
  *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
@@ -105,10 +103,10 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  \return The iterator <tt>result + (last - first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
@@ -129,10 +127,10 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *
  *  thrust::adjacent_difference(thrust::device, d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
  *
- *  // d_data is now [1, 3, 3, 3, 3, 3, 3, 3]
+ *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
@@ -156,11 +154,11 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  \param result The beginning of the output range.
  *  \return The iterator <tt>result + (last - first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \c x and \c y are objects of \p InputIterator's \c value_type, then \c x - \c is defined,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and the return type of <tt>x - y</tt> is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
  *          useful for computing differences "in place".
@@ -181,7 +179,7 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
  *  // d_result is now [1, 1, -1, 1, -1, 1, -1, 1]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template <typename InputIterator, typename OutputIterator>
@@ -203,10 +201,10 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
  *  \param binary_op The binary function used to compute differences.
  *  \return The iterator <tt>result + (last - first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam BinaryFunction's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  \remark Note that \p result is permitted to be the same iterator as \p first. This is
@@ -226,10 +224,10 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
  *
  *  thrust::adjacent_difference(d_data.begin(), d_data.end(), d_result.begin(), thrust::plus<int>());
  *
- *  // d_data is now [1, 3, 3, 3, 3, 3, 3, 3]
+ *  // d_result is now [1, 3, 3, 3, 3, 3, 3, 3]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/adjacent_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/adjacent_difference
  *  \see inclusive_scan
  */
 template <typename InputIterator, typename OutputIterator, typename BinaryFunction>
@@ -240,7 +238,7 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
 /*! \}
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/adjacent_difference.inl>
 
diff --git a/thrust/advance.h b/thrust/advance.h
index ba809cc0d..a5162e203 100644
--- a/thrust/advance.h
+++ b/thrust/advance.h
@@ -23,15 +23,13 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
  */
 
-/*! \p advance(i, n) increments the iterator \p i by the distance \p n. 
+/*! \p advance(i, n) increments the iterator \p i by the distance \p n.
  *  If <tt>n > 0</tt> it is equivalent to executing <tt>++i</tt> \p n
  *  times, and if <tt>n < 0</tt> it is equivalent to executing <tt>--i</tt>
  *  \p n times. If <tt>n == 0</tt>, the call has no effect.
@@ -39,8 +37,8 @@ namespace thrust
  *  \param i The iterator to be advanced.
  *  \param n The distance by which to advance the iterator.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Distance is an integral type that is convertible to \p InputIterator's distance type. 
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Distance is an integral type that is convertible to \p InputIterator's distance type.
  *
  *  \pre \p n shall be negative only for bidirectional and random access iterators.
  *
@@ -59,16 +57,84 @@ namespace thrust
  *  // iter - vec.begin() == 7
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/advance.html
+ *  \see https://en.cppreference.com/w/cpp/iterator/advance
  */
 template <typename InputIterator, typename Distance>
 __host__ __device__
 void advance(InputIterator& i, Distance n);
 
+/*! \p next(i, n) returns the \p n th successor of the iterator \p i.
+ *
+ *  \param i An iterator.
+ *  \param n The number of elements to advance.
+ *
+ *  \tparam InputIterator must meet the <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">InputIterator</a>.
+ *
+ *  \pre \p n shall be negative only for bidirectional and random access iterators.
+ *
+ *  The following code snippet demonstrates how to use \p next.
+ *
+ *  \code
+ *  #include <thrust/advance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator i0 = vec.begin();
+ *
+ *  auto i1 = thrust::next(i0);
+ *
+ *  // i0 - vec.begin() == 0
+ *  // i1 - vec.begin() == 1
+ *  \endcode
+ *
+ *  \see https://en.cppreference.com/w/cpp/iterator/next
+ */
+#if 0 // Doxygen only
+template <typename InputIterator, typename Distance>
+__host__ __device__
+InputIterator next(
+  InputIterator i
+, typename iterator_traits<InputIterator>::difference_type n = 1
+);
+#endif
+
+/*! \p prev(i, n) returns the \p n th predecessor of the iterator \p i.
+ *
+ *  \param i An iterator.
+ *  \param n The number of elements to descend.
+ *
+ *  \tparam BidirectionalIterator must meet the <a href="https://en.cppreference.com/w/cpp/named_req/BidirectionalIterator">BidirectionalIterator</a>.
+ *
+ *  The following code snippet demonstrates how to use \p prev.
+ *
+ *  \code
+ *  #include <thrust/advance.h>
+ *  #include <thrust/device_vector.h>
+ *  ...
+ *  thrust::device_vector<int> vec(13);
+ *  thrust::device_vector<int>::iterator i0 = vec.end();
+ *
+ *  auto i1 = thrust::prev(i0);
+ *
+ *  // vec.end() - i0 == 0
+ *  // vec.end() - i1 == 1
+ *  \endcode
+ *
+ *  \see https://en.cppreference.com/w/cpp/iterator/prev
+ */
+#if 0 // Doxygen only
+template <typename BidirectionalIterator, typename Distance>
+__host__ __device__
+BidirectionalIterator prev(
+  BidirectionalIterator i
+, typename iterator_traits<BidirectionalIterator>::difference_type n = 1
+);
+#endif
+
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/advance.inl>
 
diff --git a/thrust/allocate_unique.h b/thrust/allocate_unique.h
new file mode 100644
index 000000000..ff10cb51c
--- /dev/null
+++ b/thrust/allocate_unique.h
@@ -0,0 +1,443 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/type_deduction.h>
+#include <thrust/detail/memory_algorithms.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+
+#include <utility>
+#include <thrust/detail/memory_wrapper.h>
+
+THRUST_NAMESPACE_BEGIN
+
+// wg21.link/p0316r0
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail
+{
+
+template <typename Allocator, typename Pointer>
+void allocator_delete_impl(
+  Allocator const& alloc, Pointer p, std::false_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::destroy(alloc_T, thrust::raw_pointer_cast(p));
+    traits::deallocate(alloc_T, p, 1);
+  }
+}
+
+template <typename Allocator, typename Pointer>
+void allocator_delete_impl(
+  Allocator const& alloc, Pointer p, std::true_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::deallocate(alloc_T, p, 1);
+  }
+}
+
+} // namespace detail
+
+template <typename T, typename Allocator, bool Uninitialized = false>
+struct allocator_delete final
+{
+  using allocator_type
+    = typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type::template rebind<T>::other;
+  using pointer = typename detail::allocator_traits<allocator_type>::pointer;
+
+  template <typename UAllocator>
+  allocator_delete(UAllocator&& other) noexcept
+    : alloc_(THRUST_FWD(other))
+  {}
+
+  template <typename U, typename UAllocator>
+  allocator_delete(
+      allocator_delete<U, UAllocator> const& other
+    ) noexcept
+    : alloc_(other.get_allocator())
+  {}
+  template <typename U, typename UAllocator>
+  allocator_delete(
+      allocator_delete<U, UAllocator>&& other
+    ) noexcept
+    : alloc_(std::move(other.get_allocator()))
+  {}
+
+  template <typename U, typename UAllocator>
+  allocator_delete& operator=(
+    allocator_delete<U, UAllocator> const& other
+  ) noexcept
+  {
+    alloc_ = other.get_allocator();
+    return *this;
+  }
+  template <typename U, typename UAllocator>
+  allocator_delete& operator=(
+    allocator_delete<U, UAllocator>&& other
+  ) noexcept
+  {
+    alloc_ = std::move(other.get_allocator());
+    return *this;
+  }
+
+  void operator()(pointer p)
+  {
+    std::integral_constant<bool, Uninitialized> ic;
+
+    detail::allocator_delete_impl(get_allocator(), p, ic);
+  }
+
+  allocator_type& get_allocator() noexcept { return alloc_; }
+  allocator_type const& get_allocator() const noexcept { return alloc_; }
+
+  void swap(allocator_delete& other) noexcept
+  {
+    using std::swap;
+    swap(alloc_, other.alloc_);
+  }
+
+private:
+  allocator_type alloc_;
+};
+
+template <typename T, typename Allocator>
+using uninitialized_allocator_delete = allocator_delete<T, Allocator, true>;
+
+namespace detail {
+
+template <typename Allocator, typename Pointer, typename Size>
+void array_allocator_delete_impl(
+  Allocator const& alloc, Pointer p, Size count, std::false_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    destroy_n(alloc_T, p, count);
+    traits::deallocate(alloc_T, p, count);
+  }
+}
+
+template <typename Allocator, typename Pointer, typename Size>
+void array_allocator_delete_impl(
+  Allocator const& alloc, Pointer p, Size count, std::true_type
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  if (nullptr != pointer_traits<Pointer>::get(p))
+  {
+    traits::deallocate(alloc_T, p, count);
+  }
+}
+
+} // namespace detail
+
+template <typename T, typename Allocator, bool Uninitialized = false>
+struct array_allocator_delete final
+{
+  using allocator_type
+    = typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type::template rebind<T>::other;
+  using pointer = typename detail::allocator_traits<allocator_type>::pointer;
+
+  template <typename UAllocator>
+  array_allocator_delete(UAllocator&& other, std::size_t n) noexcept
+    : alloc_(THRUST_FWD(other)), count_(n)
+  {}
+
+  template <typename U, typename UAllocator>
+  array_allocator_delete(
+      array_allocator_delete<U, UAllocator> const& other
+    ) noexcept
+    : alloc_(other.get_allocator()), count_(other.count_)
+  {}
+  template <typename U, typename UAllocator>
+  array_allocator_delete(
+      array_allocator_delete<U, UAllocator>&& other
+    ) noexcept
+    : alloc_(std::move(other.get_allocator())), count_(other.count_)
+  {}
+
+  template <typename U, typename UAllocator>
+  array_allocator_delete& operator=(
+    array_allocator_delete<U, UAllocator> const& other
+  ) noexcept
+  {
+    alloc_ = other.get_allocator();
+    count_ = other.count_;
+    return *this;
+  }
+  template <typename U, typename UAllocator>
+  array_allocator_delete& operator=(
+    array_allocator_delete<U, UAllocator>&& other
+  ) noexcept
+  {
+    alloc_ = std::move(other.get_allocator());
+    count_ = other.count_;
+    return *this;
+  }
+
+  void operator()(pointer p)
+  {
+    std::integral_constant<bool, Uninitialized> ic;
+
+    detail::array_allocator_delete_impl(get_allocator(), p, count_, ic);
+  }
+
+  allocator_type& get_allocator() noexcept { return alloc_; }
+  allocator_type const& get_allocator() const noexcept { return alloc_; }
+
+  void swap(array_allocator_delete& other) noexcept
+  {
+    using std::swap;
+    swap(alloc_, other.alloc_);
+    swap(count_, other.count_);
+  }
+
+private:
+  allocator_type alloc_;
+  std::size_t    count_;
+};
+  
+template <typename T, typename Allocator>
+using uninitialized_array_allocator_delete
+  = array_allocator_delete<T, Allocator, true>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Pointer, typename Lambda>
+struct tagged_deleter : Lambda
+{
+  __host__ __device__
+  tagged_deleter(Lambda&& l) : Lambda(THRUST_FWD(l)) {}
+
+  using pointer = Pointer;
+};
+
+template <typename Pointer, typename Lambda>
+__host__ __device__
+tagged_deleter<Pointer, Lambda>
+make_tagged_deleter(Lambda&& l)
+{
+  return tagged_deleter<Pointer, Lambda>(THRUST_FWD(l));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Allocator, typename... Args>
+__host__
+std::unique_ptr<
+  T,
+  allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+allocate_unique(
+  Allocator const& alloc, Args&&... args
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [&alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, 1);
+    }
+  );
+  using hold_t = std::unique_ptr<T, decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, 1), hold_deleter);
+
+  traits::construct(
+    alloc_T, thrust::raw_pointer_cast(hold.get()), THRUST_FWD(args)...
+  );
+  auto deleter = allocator_delete<T, typename traits::allocator_type>(alloc);
+  return std::unique_ptr<T, decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator>
+__host__
+std::unique_ptr<
+  T,
+  uninitialized_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+uninitialized_allocate_unique(
+  Allocator const& alloc
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [&alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, 1);
+    }
+  );
+  using hold_t = std::unique_ptr<T, decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, 1), hold_deleter);
+
+  auto deleter = uninitialized_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T);
+  return std::unique_ptr<T, decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator, typename Size, typename... Args>
+__host__
+std::unique_ptr<
+  T[],
+  array_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+allocate_unique_n(
+  Allocator const& alloc, Size n, Args&&... args
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [n, &alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, n);
+    }
+  );
+  using hold_t = std::unique_ptr<T[], decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, n), hold_deleter);
+
+  uninitialized_construct_n_with_allocator(
+    alloc_T, hold.get(), n, THRUST_FWD(args)...
+  );
+  auto deleter = array_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T, n);
+  return std::unique_ptr<T[], decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+template <typename T, typename Allocator, typename Size>
+__host__
+std::unique_ptr<
+  T[],
+  uninitialized_array_allocator_delete<
+    T
+  , typename detail::allocator_traits<
+      typename std::remove_cv<
+        typename std::remove_reference<Allocator>::type
+      >::type
+    >::template rebind_traits<T>::allocator_type
+  >
+>
+uninitialized_allocate_unique_n(
+  Allocator const& alloc, Size n
+)
+{
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  auto hold_deleter = make_tagged_deleter<typename traits::pointer>(
+    [n, &alloc_T] (typename traits::pointer p) {
+      traits::deallocate(alloc_T, p, n);
+    }
+  );
+  using hold_t = std::unique_ptr<T[], decltype(hold_deleter)>;
+  auto hold = hold_t(traits::allocate(alloc_T, n), hold_deleter);
+
+  auto deleter = uninitialized_array_allocator_delete<
+    T, typename traits::allocator_type
+  >(alloc_T, n);
+  return std::unique_ptr<T[], decltype(deleter)>
+    (hold.release(), std::move(deleter));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/async/copy.h b/thrust/async/copy.h
new file mode 100644
index 000000000..a8edc7411
--- /dev/null
+++ b/thrust/async/copy.h
@@ -0,0 +1,154 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Algorithms for asynchronously copying a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/copy.h>
+
+#include <thrust/event.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+/*! \cond
+ */
+
+namespace unimplemented
+{
+
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+__host__
+event<FromPolicy>
+async_copy(
+  thrust::execution_policy<FromPolicy>& from_exec
+, thrust::execution_policy<ToPolicy>&   to_exec
+, ForwardIt first, Sentinel last, OutputIt output
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace copy_detail
+{
+
+using thrust::async::unimplemented::async_copy;
+
+struct copy_fn final
+{
+  template <
+    typename FromPolicy, typename ToPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<FromPolicy> const& from_exec
+  , thrust::detail::execution_policy_base<ToPolicy> const&   to_exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_copy(
+      thrust::detail::derived_cast(thrust::detail::strip_const(from_exec))
+    , thrust::detail::derived_cast(thrust::detail::strip_const(to_exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  THRUST_RETURNS(
+    copy_fn::call(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+      // Synthesize a suitable new execution policy, because we don't want to
+      // try and extract twice from the one we were passed.
+    , typename remove_cvref_t<
+        decltype(thrust::detail::derived_cast(thrust::detail::strip_const(exec)))
+      >::tag_type{}
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, OutputIt&& output)
+  THRUST_RETURNS(
+    copy_fn::call(
+      thrust::detail::select_system(
+        typename thrust::iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , thrust::detail::select_system(
+        typename thrust::iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace copy_detail
+
+THRUST_INLINE_CONSTANT copy_detail::copy_fn copy{};
+
+/*! \endcond
+ */
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
+
diff --git a/thrust/async/for_each.h b/thrust/async/for_each.h
new file mode 100644
index 000000000..0d3b3a189
--- /dev/null
+++ b/thrust/async/for_each.h
@@ -0,0 +1,123 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Algorithms for asynchronously iterating over the elements of a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/for_each.h>
+
+#include <thrust/event.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+/*! \cond
+ */
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename UnaryFunction
+>
+__host__
+event<DerivedPolicy>
+async_for_each(
+  thrust::execution_policy<DerivedPolicy>&, ForwardIt, Sentinel, UnaryFunction
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace for_each_detail
+{
+
+using thrust::async::unimplemented::async_for_each;
+
+struct for_each_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename UnaryFunction
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , UnaryFunction&& f
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_for_each(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(f)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename UnaryFunction>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, UnaryFunction&& f)
+  THRUST_RETURNS(
+    for_each_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(f)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace for_each_detail
+
+THRUST_INLINE_CONSTANT for_each_detail::for_each_fn for_each{};
+
+/*! \endcond
+ */
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/async/reduce.h b/thrust/async/reduce.h
new file mode 100644
index 000000000..8f4fe3133
--- /dev/null
+++ b/thrust/async/reduce.h
@@ -0,0 +1,446 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Algorithms for asynchronously reducing a range to a single value.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/system/detail/adl/async/reduce.h>
+
+#include <thrust/future.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+/*! \cond
+ */
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+>
+__host__
+future<DerivedPolicy, T>
+async_reduce(
+  thrust::execution_policy<DerivedPolicy>&, ForwardIt, Sentinel, T, BinaryOp
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace reduce_detail
+{
+
+using thrust::async::unimplemented::async_reduce;
+
+struct reduce_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , T&& init
+  , BinaryOp&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename T
+  >
+  __host__
+  static auto call4(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , T&& init
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__
+  static auto
+  call3(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename T, typename BinaryOp>
+  __host__
+  static auto call4(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    BinaryOp&& op,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename T>
+  __host__
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    T&& init,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    reduce_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_fn::call4(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+                     thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
+    reduce_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace reduce_detail
+
+THRUST_INLINE_CONSTANT reduce_detail::reduce_fn reduce{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename T, typename BinaryOp
+>
+__host__
+event<DerivedPolicy>
+async_reduce_into(
+  thrust::execution_policy<DerivedPolicy>&
+, ForwardIt, Sentinel, OutputIt, T, BinaryOp
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace reduce_into_detail
+{
+
+using thrust::async::unimplemented::async_reduce_into;
+
+struct reduce_into_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T, typename BinaryOp
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , BinaryOp&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T
+  >
+  __host__
+  static auto call5(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto
+  call4(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , thrust::true_type
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_reduce_into(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T, typename BinaryOp
+  >
+  __host__
+  static auto call5(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , BinaryOp&& op
+  , thrust::false_type
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename T
+  >
+  __host__
+  static auto call4(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , T&& init
+  , thrust::false_type
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(init)
+    , thrust::plus<remove_cvref_t<T>>{}
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  >
+  __host__
+  static auto call(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  )
+  THRUST_RETURNS(
+    reduce_into_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type{}
+    , thrust::plus<
+        remove_cvref_t<
+          typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+        >
+      >{}
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3, typename T4>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4)
+  THRUST_RETURNS(
+    reduce_into_fn::call4(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename T1, typename T2, typename T3, typename T4, typename T5>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3, T4&& t4, T5&& t5)
+  THRUST_RETURNS(
+    reduce_into_fn::call5(
+      THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3), THRUST_FWD(t4),
+      THRUST_FWD(t5), thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace reduce_into_detail
+
+THRUST_INLINE_CONSTANT reduce_into_detail::reduce_into_fn reduce_into{};
+
+/*! \endcond
+ */
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
+
diff --git a/thrust/async/scan.h b/thrust/async/scan.h
new file mode 100644
index 000000000..1bcf81257
--- /dev/null
+++ b/thrust/async/scan.h
@@ -0,0 +1,344 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file async/scan.h
+ *  \brief Functions for asynchronously computing prefix scans.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/detail/static_assert.h>
+
+#include <thrust/system/detail/adl/async/scan.h>
+
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/future.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+// Fallback implementations used when no overloads are found via ADL:
+namespace unimplemented
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_inclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+event<DerivedPolicy>
+async_exclusive_scan(thrust::execution_policy<DerivedPolicy>&,
+                     ForwardIt,
+                     Sentinel,
+                     OutputIt,
+                     InitialValueType,
+                     BinaryOp)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value),
+    "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace inclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_inclusive_scan;
+
+// Implementation of the thrust::async::inclusive_scan CPO.
+struct inclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out,
+                  BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  auto operator()(ForwardIt&& first, Sentinel&& last, OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_inclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace inclusive_scan_detail
+
+THRUST_INLINE_CONSTANT inclusive_scan_detail::inclusive_scan_fn inclusive_scan{};
+
+namespace exclusive_scan_detail
+{
+
+// Include fallback implementation for ADL failures
+using thrust::async::unimplemented::async_exclusive_scan;
+
+// Implementation of the thrust::async::exclusive_scan CPO.
+struct exclusive_scan_fn final
+{
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename DerivedPolicy,
+            typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt>
+  auto
+  operator()(thrust::detail::execution_policy_base<DerivedPolicy> const& exec,
+             ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename BinaryOp,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init,
+             BinaryOp&& op) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      THRUST_FWD(op)
+    )
+  )
+
+  template <typename ForwardIt,
+            typename Sentinel,
+            typename OutputIt,
+            typename InitialValueType,
+            typename = std::enable_if_t<!is_execution_policy_v<remove_cvref_t<ForwardIt>>>>
+  auto
+  operator()(ForwardIt&& first,
+             Sentinel&& last,
+             OutputIt&& out,
+             InitialValueType&& init) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      THRUST_FWD(init),
+      thrust::plus<>{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename OutputIt>
+  auto operator()(ForwardIt&& first,
+                  Sentinel&& last,
+                  OutputIt&& out) const
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_exclusive_scan(
+      thrust::detail::select_system(
+        iterator_system_t<remove_cvref_t<ForwardIt>>{},
+        iterator_system_t<remove_cvref_t<OutputIt>>{}
+      ),
+      THRUST_FWD(first),
+      THRUST_FWD(last),
+      THRUST_FWD(out),
+      iterator_value_t<remove_cvref_t<ForwardIt>>{},
+      thrust::plus<>{}
+    )
+  )
+};
+
+} // namespace exclusive_scan_detail
+
+THRUST_INLINE_CONSTANT exclusive_scan_detail::exclusive_scan_fn exclusive_scan{};
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/async/sort.h b/thrust/async/sort.h
new file mode 100644
index 000000000..888179397
--- /dev/null
+++ b/thrust/async/sort.h
@@ -0,0 +1,280 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Algorithms for asynchronously sorting a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/type_traits/is_execution_policy.h>
+#include <thrust/system/detail/adl/async/sort.h>
+
+#include <thrust/event.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+/*! \cond
+ */
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+__host__
+event<DerivedPolicy>
+async_stable_sort(
+  thrust::execution_policy<DerivedPolicy>&
+, ForwardIt, Sentinel, StrictWeakOrdering
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace stable_sort_detail
+{
+
+using thrust::async::unimplemented::async_stable_sort;
+
+struct stable_sort_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , StrictWeakOrdering&& comp
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_stable_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_stable_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp)
+  THRUST_RETURNS(
+    stable_sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
+    stable_sort_fn::call(
+      THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace stable_sort_detail
+
+THRUST_INLINE_CONSTANT stable_sort_detail::stable_sort_fn stable_sort{};
+
+namespace fallback
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+__host__
+event<DerivedPolicy>
+async_sort(
+  thrust::execution_policy<DerivedPolicy>& exec
+, ForwardIt&& first, Sentinel&& last, StrictWeakOrdering&& comp
+)
+{
+  return async_stable_sort(
+    thrust::detail::derived_cast(exec)
+  , THRUST_FWD(first), THRUST_FWD(last), THRUST_FWD(comp)
+  );
+}
+
+} // namespace fallback
+
+namespace sort_detail
+{
+
+using thrust::async::fallback::async_sort;
+
+struct sort_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+  >
+  __host__
+  static auto call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , StrictWeakOrdering&& comp
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_sort(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel
+  >
+  __host__
+  static auto call3(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , thrust::true_type
+  )
+  THRUST_RETURNS(
+    sort_fn::call(
+      exec
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename ForwardIt, typename Sentinel, typename StrictWeakOrdering>
+  __host__
+  static auto call3(ForwardIt&& first, Sentinel&& last,
+                    StrictWeakOrdering&& comp,
+                    thrust::false_type)
+  THRUST_RETURNS(
+    sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(comp)
+    )
+  )
+
+  // MSVC WAR: MSVC gets angsty and eats all available RAM when we try to detect
+  // if T1 is an execution_policy by using SFINAE. Switching to a static
+  // dispatch pattern to prevent this.
+  template <typename T1, typename T2, typename T3>
+  __host__
+  static auto call(T1&& t1, T2&& t2, T3&& t3)
+  THRUST_RETURNS(
+    sort_fn::call3(THRUST_FWD(t1), THRUST_FWD(t2), THRUST_FWD(t3),
+                   thrust::is_execution_policy<thrust::remove_cvref_t<T1>>{})
+  )
+
+  template <typename ForwardIt, typename Sentinel>
+  __host__
+  static auto call(ForwardIt&& first, Sentinel&& last)
+  THRUST_RETURNS(
+    sort_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , thrust::less<
+        typename iterator_traits<remove_cvref_t<ForwardIt>>::value_type
+      >{}
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace sort_detail
+
+THRUST_INLINE_CONSTANT sort_detail::sort_fn sort{};
+
+/*! \endcond
+ */
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
+
diff --git a/thrust/async/transform.h b/thrust/async/transform.h
new file mode 100644
index 000000000..de72549bf
--- /dev/null
+++ b/thrust/async/transform.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Algorithms for asynchronously transforming a range.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/select_system.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/adl/async/transform.h>
+
+#include <thrust/event.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace async
+{
+
+/*! \cond
+ */
+
+namespace unimplemented
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename UnaryOperation
+>
+__host__
+event<DerivedPolicy>
+async_transform(
+  thrust::execution_policy<DerivedPolicy>& exec
+, ForwardIt first, Sentinel last, OutputIt output, UnaryOperation op
+)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<ForwardIt, false>::value)
+  , "this algorithm is not implemented for the specified system"
+  );
+  return {};
+}
+
+} // namespace unimplemented
+
+namespace transform_detail
+{
+
+using thrust::async::unimplemented::async_transform;
+
+struct transform_fn final
+{
+  template <
+    typename DerivedPolicy
+  , typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename UnaryOperation
+  >
+  __host__
+  static auto
+  call(
+    thrust::detail::execution_policy_base<DerivedPolicy> const& exec
+  , ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , UnaryOperation&& op
+  )
+  // ADL dispatch.
+  THRUST_RETURNS(
+    async_transform(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec))
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <
+    typename ForwardIt, typename Sentinel, typename OutputIt
+  , typename UnaryOperation
+  >
+  __host__
+  static auto call(
+    ForwardIt&& first, Sentinel&& last
+  , OutputIt&& output
+  , UnaryOperation&& op
+  )
+  THRUST_RETURNS(
+    transform_fn::call(
+      thrust::detail::select_system(
+        typename iterator_system<remove_cvref_t<ForwardIt>>::type{}
+      , typename iterator_system<remove_cvref_t<OutputIt>>::type{}
+      )
+    , THRUST_FWD(first), THRUST_FWD(last)
+    , THRUST_FWD(output)
+    , THRUST_FWD(op)
+    )
+  )
+
+  template <typename... Args>
+  THRUST_NODISCARD __host__
+  auto operator()(Args&&... args) const
+  THRUST_RETURNS(
+    call(THRUST_FWD(args)...)
+  )
+};
+
+} // namespace tranform_detail
+
+THRUST_INLINE_CONSTANT transform_detail::transform_fn transform{};
+
+/*! \endcond
+ */
+
+} // namespace async
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/binary_search.h b/thrust/binary_search.h
index 127be16aa..7a4746e0b 100644
--- a/thrust/binary_search.h
+++ b/thrust/binary_search.h
@@ -25,10 +25,8 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-    
 /*! \addtogroup algorithms
  */
 
@@ -67,8 +65,8 @@ namespace thrust
  *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -94,7 +92,7 @@ namespace thrust
  *  thrust::lower_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -120,8 +118,8 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param value The value to be searched.
  *  \return The furthermost iterator \c i, such that <tt>*i < value</tt>.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range.
@@ -146,7 +144,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::lower_bound(input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -176,9 +174,9 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -205,7 +203,7 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -234,9 +232,9 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param comp The comparison operator.
  *  \return The furthermost iterator \c i, such that <tt>comp(*i, value)</tt> is \c true.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p lower_bound
  *  to search for values in a ordered range.
@@ -262,7 +260,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::lower_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -292,8 +290,8 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelism:
@@ -319,7 +317,7 @@ ForwardIterator lower_bound(ForwardIterator first,
  *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -346,8 +344,8 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param value The value to be searched.
  *  \return The furthermost iterator \c i, such that <tt>value < *i</tt> is \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range.
@@ -372,7 +370,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::upper_bound(input.begin(), input.end(), 9); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -402,9 +400,9 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -431,7 +429,7 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  thrust::upper_bound(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -459,9 +457,9 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  \param comp The comparison operator.
  *  \return The furthermost iterator \c i, such that <tt>comp(value, *i)</tt> is \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p upper_bound
  *  to search for values in a ordered range.
@@ -487,7 +485,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
  *  thrust::upper_bound(input.begin(), input.end(), 9, thrust::less<int>()); // returns input.end()
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -516,8 +514,8 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -543,7 +541,7 @@ ForwardIterator upper_bound(ForwardIterator first,
  *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -569,8 +567,8 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param value The value to be searched.
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range.
@@ -595,7 +593,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  thrust::binary_search(input.begin(), input.end(), 9); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -624,9 +622,9 @@ bool binary_search(ForwardIterator first,
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -653,7 +651,7 @@ bool binary_search(ForwardIterator first,
  *  thrust::binary_search(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -681,9 +679,9 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param comp The comparison operator.
  *  \return \c true if an equivalent element exists in <tt>[first, last)</tt>, otherwise \c false.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p binary_search
  *  to search for values in a ordered range.
@@ -709,7 +707,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  thrust::binary_search(input.begin(), input.end(), 9, thrust::less<int>()); // returns false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -751,8 +749,8 @@ bool binary_search(ForwardIterator first,
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -778,7 +776,7 @@ bool binary_search(ForwardIterator first,
  *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -818,8 +816,8 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  \param value The value to be searched.
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam LessThanComparable is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>. 
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam LessThanComparable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>. 
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range.
@@ -844,7 +842,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  thrust::equal_range(input.begin(), input.end(), 9); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -888,9 +886,9 @@ equal_range(ForwardIterator first,
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range using the \p thrust::device execution policy for parallelization:
@@ -917,7 +915,7 @@ equal_range(ForwardIterator first,
  *  thrust::equal_range(thrust::device, input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -960,9 +958,9 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  \param comp The comparison operator.
  *  \return A \p pair of iterators <tt>[i, j)</tt> that define the range of equivalent elements.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
  *  \tparam T is comparable to \p ForwardIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p equal_range
  *  to search for values in a ordered range.
@@ -988,7 +986,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
  *  thrust::equal_range(input.begin(), input.end(), 9, thrust::less<int>()); // returns [input.end(), input.end)
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal_range.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal_range
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p binary_search
@@ -1028,10 +1026,10 @@ equal_range(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1071,7 +1069,7 @@ equal_range(ForwardIterator first,
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1098,10 +1096,10 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1138,7 +1136,7 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1169,12 +1167,12 @@ OutputIterator lower_bound(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1213,7 +1211,7 @@ OutputIterator lower_bound(ForwardIterator first,
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1243,12 +1241,12 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1286,7 +1284,7 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [0, 1, 1, 2, 4, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/lower_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1316,10 +1314,10 @@ OutputIterator lower_bound(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1359,7 +1357,7 @@ OutputIterator lower_bound(ForwardIterator first,
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1386,10 +1384,10 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1426,7 +1424,7 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p upper_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1457,12 +1455,12 @@ OutputIterator upper_bound(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1503,7 +1501,7 @@ OutputIterator upper_bound(ForwardIterator first,
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1533,12 +1531,12 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *                        and \c InputIterator's \c value_type is comparable to \p ForwardIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and \c ForwardIterator's difference_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1576,7 +1574,7 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
  *  // output is now [1, 1, 2, 2, 5, 5]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/upper_bound.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/upper_bound
  *  \see \p lower_bound
  *  \see \p equal_range
  *  \see \p binary_search
@@ -1607,10 +1605,10 @@ OutputIterator upper_bound(ForwardIterator first,
  *  \param result The beginning of the output sequence.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1650,7 +1648,7 @@ OutputIterator upper_bound(ForwardIterator first,
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1678,10 +1676,10 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  \param values_last The end of the search values sequence.
  *  \param result The beginning of the output sequence.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -1718,7 +1716,7 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1750,12 +1748,12 @@ OutputIterator binary_search(ForwardIterator first,
  *  \param comp The comparison operator.
  * 
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1796,7 +1794,7 @@ OutputIterator binary_search(ForwardIterator first,
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1827,12 +1825,12 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  \param result The beginning of the output sequence.
  *  \param comp The comparison operator.
  * 
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *                        and \c InputIterator's \c value_type is <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThanComparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *                        and \c InputIterator's \c value_type is <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThanComparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *                        and bool is convertible to \c OutputIterator's \c value_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -1870,7 +1868,7 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
  *  // output is now [true, false, true, false, true, false]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/binary_search.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/binary_search
  *  \see \p lower_bound
  *  \see \p upper_bound
  *  \see \p equal_range
@@ -1895,8 +1893,7 @@ OutputIterator binary_search(ForwardIterator first,
 /*! \} // end searching
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/binary_search.inl>
 
diff --git a/thrust/cmake/FindTBB.cmake b/thrust/cmake/FindTBB.cmake
new file mode 100644
index 000000000..01e53d5e7
--- /dev/null
+++ b/thrust/cmake/FindTBB.cmake
@@ -0,0 +1,446 @@
+# - Find ThreadingBuildingBlocks include dirs and libraries
+# Use this module by invoking find_package with the form:
+#  find_package(TBB
+#    [REQUIRED]             # Fail with error if TBB is not found
+#    )                      #
+# Once done, this will define
+#
+#  TBB_FOUND - system has TBB
+#  TBB_INCLUDE_DIRS - the TBB include directories
+#  TBB_LIBRARIES - TBB libraries to be lined, doesn't include malloc or
+#                  malloc proxy
+#  TBB::tbb - imported target for the TBB library
+#
+#  TBB_VERSION - Product Version Number ("MAJOR.MINOR")
+#  TBB_VERSION_MAJOR - Major Product Version Number
+#  TBB_VERSION_MINOR - Minor Product Version Number
+#  TBB_INTERFACE_VERSION - Engineering Focused Version Number
+#  TBB_COMPATIBLE_INTERFACE_VERSION - The oldest major interface version
+#                                     still supported. This uses the engineering
+#                                     focused interface version numbers.
+#
+#  TBB_MALLOC_FOUND - system has TBB malloc library
+#  TBB_MALLOC_INCLUDE_DIRS - the TBB malloc include directories
+#  TBB_MALLOC_LIBRARIES - The TBB malloc libraries to be lined
+#  TBB::malloc - imported target for the TBB malloc library
+#
+#  TBB_MALLOC_PROXY_FOUND - system has TBB malloc proxy library
+#  TBB_MALLOC_PROXY_INCLUDE_DIRS = the TBB malloc proxy include directories
+#  TBB_MALLOC_PROXY_LIBRARIES - The TBB malloc proxy libraries to be lined
+#  TBB::malloc_proxy - imported target for the TBB malloc proxy library
+#
+#
+# This module reads hints about search locations from variables:
+#  ENV TBB_ARCH_PLATFORM - for eg. set it to "mic" for Xeon Phi builds
+#  ENV TBB_ROOT or just TBB_ROOT - root directory of tbb installation
+#  ENV TBB_BUILD_PREFIX - specifies the build prefix for user built tbb
+#                         libraries. Should be specified with ENV TBB_ROOT
+#                         and optionally...
+#  ENV TBB_BUILD_DIR - if build directory is different than ${TBB_ROOT}/build
+#
+#
+# Modified by Robert Maynard from the original OGRE source
+#
+#-------------------------------------------------------------------
+# This file is part of the CMake build system for OGRE
+#     (Object-oriented Graphics Rendering Engine)
+# For the latest info, see http://www.ogre3d.org/
+#
+# The contents of this file are placed in the public domain. Feel
+# free to make use of it in any way you like.
+#-------------------------------------------------------------------
+#
+#=============================================================================
+# Copyright 2010-2012 Kitware, Inc.
+# Copyright 2012      Rolf Eike Beer <eike@sf-mail.de>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+#  License text for the above reference.)
+
+
+#=============================================================================
+#  FindTBB helper functions and macros
+#
+
+#====================================================
+# Fix the library path in case it is a linker script
+#====================================================
+function(tbb_extract_real_library library real_library)
+  if(NOT UNIX OR NOT EXISTS ${library})
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  #Read in the first 4 bytes and see if they are the ELF magic number
+  set(_elf_magic "7f454c46")
+  file(READ ${library} _hex_data OFFSET 0 LIMIT 4 HEX)
+  if(_hex_data STREQUAL _elf_magic)
+    #we have opened a elf binary so this is what
+    #we should link to
+    set(${real_library} "${library}" PARENT_SCOPE)
+    return()
+  endif()
+
+  file(READ ${library} _data OFFSET 0 LIMIT 1024)
+  if("${_data}" MATCHES "INPUT \\(([^(]+)\\)")
+    #extract out the .so name from REGEX MATCH command
+    set(_proper_so_name "${CMAKE_MATCH_1}")
+
+    #construct path to the real .so which is presumed to be in the same directory
+    #as the input file
+    get_filename_component(_so_dir "${library}" DIRECTORY)
+    set(${real_library} "${_so_dir}/${_proper_so_name}" PARENT_SCOPE)
+  else()
+    #unable to determine what this library is so just hope everything works
+    #and pass it unmodified.
+    set(${real_library} "${library}" PARENT_SCOPE)
+  endif()
+endfunction()
+
+#===============================================
+# Do the final processing for the package find.
+#===============================================
+macro(findpkg_finish PREFIX TARGET_NAME)
+  if (${PREFIX}_INCLUDE_DIR AND ${PREFIX}_LIBRARY)
+    set(${PREFIX}_FOUND TRUE)
+    set (${PREFIX}_INCLUDE_DIRS ${${PREFIX}_INCLUDE_DIR})
+    set (${PREFIX}_LIBRARIES ${${PREFIX}_LIBRARY})
+  else ()
+    if (${PREFIX}_FIND_REQUIRED)
+      message(FATAL_ERROR "Required library ${PREFIX} not found.")
+    elseif (NOT ${PREFIX}_FIND_QUIETLY)
+      message("Library ${PREFIX} not found.")
+    endif()
+    return()
+  endif ()
+
+  if (NOT TARGET "TBB::${TARGET_NAME}")
+    if (${PREFIX}_LIBRARY_RELEASE)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_RELEASE} real_release)
+    endif ()
+    if (${PREFIX}_LIBRARY_DEBUG)
+      tbb_extract_real_library(${${PREFIX}_LIBRARY_DEBUG} real_debug)
+    endif ()
+    add_library(TBB::${TARGET_NAME} UNKNOWN IMPORTED)
+    set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+      INTERFACE_INCLUDE_DIRECTORIES "${${PREFIX}_INCLUDE_DIR}")
+    if (${PREFIX}_LIBRARY_DEBUG AND ${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}"
+        IMPORTED_LOCATION_DEBUG "${real_debug}"
+        IMPORTED_LOCATION_RELEASE "${real_release}")
+    elseif (${PREFIX}_LIBRARY_RELEASE)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_release}")
+    elseif (${PREFIX}_LIBRARY_DEBUG)
+      set_target_properties(TBB::${TARGET_NAME} PROPERTIES
+        IMPORTED_LOCATION "${real_debug}")
+    endif ()
+  endif ()
+
+  #mark the following variables as internal variables
+  mark_as_advanced(${PREFIX}_INCLUDE_DIR
+                   ${PREFIX}_LIBRARY
+                   ${PREFIX}_LIBRARY_DEBUG
+                   ${PREFIX}_LIBRARY_RELEASE)
+endmacro()
+
+#===============================================
+# Generate debug names from given release names
+#===============================================
+macro(get_debug_names PREFIX)
+  foreach(i ${${PREFIX}})
+    set(${PREFIX}_DEBUG ${${PREFIX}_DEBUG} ${i}d ${i}D ${i}_d ${i}_D ${i}_debug ${i})
+  endforeach()
+endmacro()
+
+#===============================================
+# See if we have env vars to help us find tbb
+#===============================================
+macro(getenv_path VAR)
+   set(ENV_${VAR} $ENV{${VAR}})
+   # replace won't work if var is blank
+   if (ENV_${VAR})
+     string( REGEX REPLACE "\\\\" "/" ENV_${VAR} ${ENV_${VAR}} )
+   endif ()
+endmacro()
+
+#===============================================
+# Couple a set of release AND debug libraries
+#===============================================
+macro(make_library_set PREFIX)
+  if (${PREFIX}_RELEASE AND ${PREFIX}_DEBUG)
+    set(${PREFIX} optimized ${${PREFIX}_RELEASE} debug ${${PREFIX}_DEBUG})
+  elseif (${PREFIX}_RELEASE)
+    set(${PREFIX} ${${PREFIX}_RELEASE})
+  elseif (${PREFIX}_DEBUG)
+    set(${PREFIX} ${${PREFIX}_DEBUG})
+  endif ()
+endmacro()
+
+
+#=============================================================================
+#  Now to actually find TBB
+#
+
+# Get path, convert backslashes as ${ENV_${var}}
+getenv_path(TBB_ROOT)
+
+# initialize search paths
+set(TBB_PREFIX_PATH ${TBB_ROOT} ${ENV_TBB_ROOT})
+set(TBB_INC_SEARCH_PATH "")
+set(TBB_LIB_SEARCH_PATH "")
+
+
+# If user built from sources
+set(TBB_BUILD_PREFIX $ENV{TBB_BUILD_PREFIX})
+if (TBB_BUILD_PREFIX AND ENV_TBB_ROOT)
+  getenv_path(TBB_BUILD_DIR)
+  if (NOT ENV_TBB_BUILD_DIR)
+    set(ENV_TBB_BUILD_DIR ${ENV_TBB_ROOT}/build)
+  endif ()
+
+  # include directory under ${ENV_TBB_ROOT}/include
+  list(APPEND TBB_LIB_SEARCH_PATH
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_release
+    ${ENV_TBB_BUILD_DIR}/${TBB_BUILD_PREFIX}_debug)
+endif ()
+
+
+# For Windows, let's assume that the user might be using the precompiled
+# TBB packages from the main website. These use a rather awkward directory
+# structure (at least for automatically finding the right files) depending
+# on platform and compiler, but we'll do our best to accommodate it.
+# Not adding the same effort for the precompiled linux builds, though. Those
+# have different versions for CC compiler versions and linux kernels which
+# will never adequately match the user's setup, so there is no feasible way
+# to detect the "best" version to use. The user will have to manually
+# select the right files. (Chances are the distributions are shipping their
+# custom version of tbb, anyway, so the problem is probably nonexistent.)
+if (WIN32 AND MSVC)
+  set(COMPILER_PREFIX "vc7.1")
+  if (MSVC_VERSION EQUAL 1400)
+    set(COMPILER_PREFIX "vc8")
+  elseif(MSVC_VERSION EQUAL 1500)
+    set(COMPILER_PREFIX "vc9")
+  elseif(MSVC_VERSION EQUAL 1600)
+    set(COMPILER_PREFIX "vc10")
+  elseif(MSVC_VERSION EQUAL 1700)
+    set(COMPILER_PREFIX "vc11")
+  elseif(MSVC_VERSION EQUAL 1800)
+    set(COMPILER_PREFIX "vc12")
+  elseif(MSVC_VERSION GREATER_EQUAL 1900 AND MSVC_VERSION LESS_EQUAL 1939)
+      # 1900-1925 actually spans three Visual Studio versions:
+      # 1900      = VS 14.0 (v140 toolset) a.k.a. MSVC 2015
+      # 1910-1919 = VS 15.0 (v141 toolset) a.k.a. MSVC 2017
+      # 1920-1929 = VS 16.0 (v142 toolset) a.k.a. MSVC 2019
+      # 1930-1939 = VS 17.0 (v143 toolset) a.k.a. MSVC 2022
+      #
+      # But these are binary compatible and TBB's open source distribution only
+      # ships a single vs14 lib (as of 2020.0)
+    set(COMPILER_PREFIX "vc14")
+  else()
+    # The next poor soul who finds themselves having to decode visual studio
+    # version conventions may find these helpful:
+    # - https://cmake.org/cmake/help/latest/variable/MSVC_VERSION.html
+    # - https://en.wikipedia.org/wiki/Microsoft_Visual_C%2B%2B#Internal_version_numbering
+    message(AUTHOR_WARNING
+      "Unrecognized MSVC version (${MSVC_VERSION}). "
+      "Please update FindTBB.cmake. "
+      "Some TBB_* CMake variables may need to be set manually."
+    )
+  endif ()
+
+  # for each prefix path, add ia32/64\${COMPILER_PREFIX}\lib to the lib search path
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    if (CMAKE_CL_64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia64/${COMPILER_PREFIX})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${COMPILER_PREFIX})
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${COMPILER_PREFIX}/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${COMPILER_PREFIX})
+    endif ()
+  endforeach ()
+endif ()
+
+# For OS X binary distribution, choose libc++ based libraries for Mavericks (10.9)
+# and above and AppleClang
+if (CMAKE_SYSTEM_NAME STREQUAL "Darwin" AND
+    NOT CMAKE_SYSTEM_VERSION VERSION_LESS 13.0)
+  set (USE_LIBCXX OFF)
+  cmake_policy(GET CMP0025 POLICY_VAR)
+
+  if (POLICY_VAR STREQUAL "NEW")
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang")
+      set (USE_LIBCXX ON)
+    endif ()
+  else ()
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+      set (USE_LIBCXX ON)
+    endif ()
+  endif ()
+
+  if (USE_LIBCXX)
+    foreach (dir IN LISTS TBB_PREFIX_PATH)
+      list (APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/libc++ ${dir}/libc++/lib)
+    endforeach ()
+  endif ()
+endif ()
+
+# check compiler ABI
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.7)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.4)
+    list(APPEND COMPILER_PREFIX "gcc4.4")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.1")
+elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  set(COMPILER_PREFIX)
+  if (NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 3.6)
+    list(APPEND COMPILER_PREFIX "gcc4.7")
+  endif()
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+else() # Assume compatibility with 4.4 for other compilers
+  list(APPEND COMPILER_PREFIX "gcc4.4")
+endif ()
+
+# if platform architecture is explicitly specified
+set(TBB_ARCH_PLATFORM $ENV{TBB_ARCH_PLATFORM})
+if (TBB_ARCH_PLATFORM)
+  foreach (dir IN LISTS TBB_PREFIX_PATH)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/${TBB_ARCH_PLATFORM}/lib)
+    list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/${TBB_ARCH_PLATFORM})
+  endforeach ()
+endif ()
+
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  foreach (prefix IN LISTS COMPILER_PREFIX)
+    if (CMAKE_SIZEOF_VOID_P EQUAL 8)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/intel64/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/intel64/${prefix}/lib)
+    else ()
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib/ia32/${prefix})
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/lib)
+      list(APPEND TBB_LIB_SEARCH_PATH ${dir}/ia32/${prefix}/lib)
+    endif ()
+  endforeach()
+endforeach ()
+
+# add general search paths
+foreach (dir IN LISTS TBB_PREFIX_PATH)
+  list(APPEND TBB_LIB_SEARCH_PATH ${dir}/lib ${dir}/Lib ${dir}/lib/tbb
+    ${dir}/Libs)
+  list(APPEND TBB_INC_SEARCH_PATH ${dir}/include ${dir}/Include
+    ${dir}/include/tbb)
+endforeach ()
+
+set(TBB_LIBRARY_NAMES tbb)
+get_debug_names(TBB_LIBRARY_NAMES)
+
+find_path(TBB_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_LIBRARY_RELEASE
+             NAMES ${TBB_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_LIBRARY_DEBUG
+             NAMES ${TBB_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_LIBRARY)
+
+findpkg_finish(TBB tbb)
+
+#if we haven't found TBB no point on going any further
+if (NOT TBB_FOUND)
+  return()
+endif ()
+
+#=============================================================================
+# Look for TBB's malloc package
+set(TBB_MALLOC_LIBRARY_NAMES tbbmalloc)
+get_debug_names(TBB_MALLOC_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_INCLUDE_DIR
+          NAMES tbb/tbb.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_LIBRARY)
+
+findpkg_finish(TBB_MALLOC tbbmalloc)
+
+#=============================================================================
+# Look for TBB's malloc proxy package
+set(TBB_MALLOC_PROXY_LIBRARY_NAMES tbbmalloc_proxy)
+get_debug_names(TBB_MALLOC_PROXY_LIBRARY_NAMES)
+
+find_path(TBB_MALLOC_PROXY_INCLUDE_DIR
+          NAMES tbb/tbbmalloc_proxy.h
+          PATHS ${TBB_INC_SEARCH_PATH})
+
+find_library(TBB_MALLOC_PROXY_LIBRARY_RELEASE
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+find_library(TBB_MALLOC_PROXY_LIBRARY_DEBUG
+             NAMES ${TBB_MALLOC_PROXY_LIBRARY_NAMES_DEBUG}
+             PATHS ${TBB_LIB_SEARCH_PATH})
+make_library_set(TBB_MALLOC_PROXY_LIBRARY)
+
+findpkg_finish(TBB_MALLOC_PROXY tbbmalloc_proxy)
+
+
+#=============================================================================
+# Parse all the version numbers from tbb.
+if(NOT TBB_VERSION)
+  if(EXISTS "${TBB_INCLUDE_DIR}/tbb/version.h")
+    # The newer oneTBB provides tbb/version.h but no tbb/tbb_stddef.h.
+    set(version_file "${TBB_INCLUDE_DIR}/tbb/version.h")
+  else()
+    # Older TBB provides tbb/tbb_stddef.h but no tbb/version.h.
+    set(version_file "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h")
+  endif()
+
+  file(STRINGS
+      "${version_file}"
+      TBB_VERSION_CONTENTS
+      REGEX "VERSION")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MAJOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+    ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1"
+    TBB_VERSION_MINOR "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  string(REGEX REPLACE
+        ".*#define TBB_COMPATIBLE_INTERFACE_VERSION ([0-9]+).*" "\\1"
+        TBB_COMPATIBLE_INTERFACE_VERSION "${TBB_VERSION_CONTENTS}")
+
+  set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}")
+endif()
diff --git a/thrust/cmake/README.md b/thrust/cmake/README.md
new file mode 100644
index 000000000..ae296b635
--- /dev/null
+++ b/thrust/cmake/README.md
@@ -0,0 +1,226 @@
+# Using Thrust with CMake
+
+Thrust provides configuration files that simplify using Thrust
+from other CMake projects. Requirements:
+
+- Thrust >= 1.9.10
+- CMake >= 3.15
+
+See the [Fixing Legacy FindThrust.cmake](#fixing-legacy-findthrustcmake)
+section for solutions that work on older Thrust versions.
+
+## User Guide
+
+#### Default Configuration (CUDA)
+
+Thrust is configured using a `thrust_create_target` CMake function that
+assembles a complete interface to the Thrust library:
+
+```cmake
+find_package(Thrust REQUIRED CONFIG)
+thrust_create_target(Thrust)
+target_link_libraries(MyProgram Thrust)
+```
+
+The first argument is the name of the interface target to create, and any
+additional options will be used to configure the target. By default,
+`thrust_create_target` will configure its result to use CUDA acceleration.
+
+If desired, `thrust_create_target` may be called multiple times to build
+several unique Thrust interface targets with different configurations, as
+detailed below.
+
+**Note:** If CMake is unable to locate Thrust, specify the path to Thrust's CMake
+configuration directory (where this README file is located) as `Thrust_DIR`.
+If cloning Thrust from github, this would be
+
+```
+$ cmake . -DThrust_DIR=<thrust git repo root>/thrust/cmake/
+```
+
+#### TBB / OpenMP
+
+To explicitly specify host/device systems, `HOST` and `DEVICE` arguments can be
+passed to `thrust_create_target`. If an explicit system is not specified, the
+target will default to using CPP for host and/or CUDA for device.
+
+```cmake
+thrust_create_target(ThrustTBB DEVICE TBB)
+thrust_create_target(ThrustOMP HOST CPP DEVICE OMP)
+```
+
+will create targets `ThrustTBB` and `ThrustOMP`. Both will use the serial `CPP`
+host system, but will find and use TBB or OpenMP for the device system.
+
+#### Configure Target from Cache Options
+
+To allow a Thrust target to be configurable easily via `cmake-gui` or
+`ccmake`, pass the `FROM_OPTIONS` flag to `thrust_create_target`. This will add
+`THRUST_HOST_SYSTEM` and `THRUST_DEVICE_SYSTEM` options to the CMake cache that
+allow selection from the systems supported by this version of Thrust.
+
+```cmake
+thrust_create_target(Thrust FROM_OPTIONS
+  [HOST_OPTION <option name>]
+  [DEVICE_OPTION <option name>]
+  [HOST_OPTION_DOC <doc string>]
+  [DEVICE_OPTION_DOC <doc string>]
+  [HOST <default host system name>]
+  [DEVICE <default device system name>]
+  [ADVANCED]
+)
+```
+
+The optional arguments have sensible defaults, but may be configured per
+`thrust_create_target` call:
+
+| Argument            | Default                 | Description                     |
+|---------------------|-------------------------|---------------------------------|
+| `HOST_OPTION`       | `THRUST_HOST_SYSTEM`    | Name of cache option for host   |
+| `DEVICE_OPTION`     | `THRUST_DEVICE_SYSTEM`  | Name of cache option for device |
+| `HOST_OPTION_DOC`   | Thrust's host system.   | Docstring for host option       |
+| `DEVICE_OPTION_DOC` | Thrust's device system. | Docstring for device option     |
+| `HOST`              | `CPP`                   | Default host system             |
+| `DEVICE`            | `CUDA`                  | Default device system           |
+| `ADVANCED`          | *N/A*                   | Mark cache options advanced     |
+
+### Specifying Thrust Version Requirements
+
+A specific version of Thrust may be required in the `find_package` call:
+
+```cmake
+find_package(Thrust 1.9.10)
+```
+
+will only consider Thrust installations with version `1.9.10.X`. An exact match
+down to the patch version can be forced by using `EXACT` matching:
+
+```cmake
+find_package(Thrust 1.9.10.1 EXACT)
+```
+
+would only match the 1.9.10.1 release.
+
+#### Using an Explicit TBB or OpenMP CMake Target
+
+When `thrust_create_target` is called, it will lazily load the requested
+systems on-demand through internal `find_package` calls. If a project already
+uses TBB or OpenMP, it may specify a CMake target for Thrust to share instead:
+
+```cmake
+thrust_set_TBB_target(MyTBBTarget)
+thrust_set_OMP_target(MyOMPTarget)
+```
+
+These functions must be called **before** the corresponding system is loaded
+through `thrust_create_target` or `find_package(Thrust COMPONENT [OMP|TBB])`.
+
+#### Using an Explicit libcu++ CMake Target
+
+In contrast to the optional TBB/OMP dependencies, there is no
+`thrust_set_libcudacxx_target` function that specifies an explicit libcu++
+target. This is because libcu++ is always required and must be found during the
+initial `find_target(Thrust)` call that defines these functions.
+
+To force Thrust to use a specific libcu++ target, ensure that either the
+`Thrust::libcudacxx` or `libcudacxx::libcudacxx` targets are defined prior to
+the first invocation of `find_package(Thrust)`. Thrust will automatically use
+these, giving preference to the `Thrust::libcudacxx` target.
+
+#### Testing for Systems
+
+The following functions check if a system has been found, either by lazy loading
+through `thrust_create_target` or as a `find_package` `COMPONENT` /
+`OPTIONAL_COMPONENT`:
+
+```cmake
+# Set var_name to TRUE or FALSE if an individual system has been found:
+thrust_is_cuda_system_found(<var_name>)
+thrust_is_cpp_system_found(<var_name>)
+thrust_is_tbb_system_found(<var_name>)
+thrust_is_omp_system_found(<var_name>)
+
+# Generic version that takes a component name from CUDA, CPP, TBB, OMP:
+thrust_is_system_found(<component_name> <var_name>)
+
+# Defines `THRUST_*_FOUND` variables in the current scope that reflect the
+# state of all known systems. Can be used to refresh these flags after
+# lazy system loading.
+thrust_update_system_found_flags()
+```
+
+#### Debugging
+
+Thrust will produce a detailed log describing its targets, cache options, and
+interfaces when `--log-level=VERBOSE` is passed to CMake 3.15.7 or newer:
+
+```
+$ cmake . --log-level=VERBOSE
+```
+
+This can be handy for inspecting interface and dependency information.
+
+## Fixing Legacy FindThrust.cmake
+
+A community-created `FindThrust.cmake` module exists and is necessary to find
+Thrust installations prior to Thrust 1.9.10. Its usage is discouraged whenever
+possible and the config files in this directory should be strongly preferred.
+However, projects that need to support old versions of Thrust may still need to
+use the legacy `FindThrust.cmake` with pre-1.9.10 installations.
+
+One popular flavor of this find module has a version parsing bug. Projects that
+rely on `FindThrust.cmake` should check for this and patch their copies as
+follows.
+
+Replace:
+
+```cmake
+string( REGEX MATCH "^[0-9]" major ${version} )
+string( REGEX REPLACE "^${major}00" "" version "${version}" )
+string( REGEX MATCH "^[0-9]" minor ${version} )
+string( REGEX REPLACE "^${minor}0" "" version "${version}" )
+```
+
+with:
+
+```cmake
+math(EXPR major "${version} / 100000")
+math(EXPR minor "(${version} / 100) % 1000")
+math(EXPR version "${version} % 100")
+```
+
+# Thrust Developer Documentation
+
+This portion of the file contains descriptions of Thrust's internal CMake target
+structure for Thrust developers. It should not be necessary for users
+who just want to use Thrust from their projects.
+
+## Internal Targets
+
+By default, `find_package(Thrust)` will only create a single `Thrust::Thrust`
+target that describes where the actual Thrust headers are located. It does not
+locate or create configurations for any dependencies; these are lazily loaded
+on-demand by calls to `create_thrust_target`, or when explicitly requested via
+`find_package`'s component mechanism.
+
+As mentioned, the basic Thrust interface is described by the `Thrust::Thrust`
+target.
+
+Each backend system (`CPP`, `CUDA`, `TBB`, `OMP`) is described by multiple
+targets:
+
+- `Thrust::${system}`
+  - Specifies an interface configured to build against all
+    dependencies for this backend (including `Thrust::Thrust`).
+  - For example, the `Thrust::CUDA` target is an interface
+    target that combines the interfaces of both Thrust and CUB.
+- `Thrust::${system}::Host`
+  - Configures an interface for using a specific host system.
+  - Multiple `::Host` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the host.
+- `Thrust::${system}::Device`
+  - Configures an interface for using a specific device system.
+  - Multiple `::Device` targets cannot be combined in the same library/executable.
+    Attempting to do so will produce a CMake configuration error.
+  - Only defined for systems that support being used as the device.
diff --git a/thrust/cmake/thrust-config-version.cmake b/thrust/cmake/thrust-config-version.cmake
new file mode 100644
index 000000000..cf9407a4c
--- /dev/null
+++ b/thrust/cmake/thrust-config-version.cmake
@@ -0,0 +1,31 @@
+# Parse version information from version.h:
+include("${CMAKE_CURRENT_LIST_DIR}/thrust-header-search.cmake")
+
+file(READ "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h" THRUST_VERSION_HEADER)
+string(REGEX MATCH "#define[ \t]+THRUST_VERSION[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_FLAT ${CMAKE_MATCH_1})
+# Note that Thrust calls this the PATCH number, CMake calls it the TWEAK number:
+string(REGEX MATCH "#define[ \t]+THRUST_PATCH_NUMBER[ \t]+([0-9]+)" DUMMY "${THRUST_VERSION_HEADER}")
+set(THRUST_VERSION_TWEAK ${CMAKE_MATCH_1})
+
+math(EXPR THRUST_VERSION_MAJOR "${THRUST_VERSION_FLAT} / 100000")
+math(EXPR THRUST_VERSION_MINOR "(${THRUST_VERSION_FLAT} / 100) % 1000")
+math(EXPR THRUST_VERSION_PATCH "${THRUST_VERSION_FLAT} % 100") # Thrust: "subminor" CMake: "patch"
+
+set(THRUST_VERSION "${THRUST_VERSION_MAJOR}.${THRUST_VERSION_MINOR}.${THRUST_VERSION_PATCH}.${THRUST_VERSION_TWEAK}")
+
+set(PACKAGE_VERSION ${THRUST_VERSION})
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_UNSUITABLE FALSE)
+
+if(PACKAGE_VERSION VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION)
+  if(THRUST_VERSION_MAJOR VERSION_EQUAL PACKAGE_FIND_VERSION_MAJOR AND
+     THRUST_VERSION_MINOR VERSION_GREATER_EQUAL PACKAGE_FIND_VERSION_MINOR)
+    set(PACKAGE_VERSION_COMPATIBLE TRUE)
+  endif()
+
+  if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+    set(PACKAGE_VERSION_EXACT TRUE)
+  endif()
+endif()
diff --git a/thrust/cmake/thrust-config.cmake b/thrust/cmake/thrust-config.cmake
new file mode 100644
index 000000000..fe88a961c
--- /dev/null
+++ b/thrust/cmake/thrust-config.cmake
@@ -0,0 +1,745 @@
+#
+# find_package(Thrust) config file.
+#
+# Provided by NVIDIA under the same license as the associated Thrust library.
+#
+# Reply-To: Allison Vacanti <alliepiper16@gmail.com>
+#
+# *****************************************************************************
+# **     The following is a short reference to using Thrust from CMake.      **
+# ** For more details, see the README.md in the same directory as this file. **
+# *****************************************************************************
+#
+# # General Usage:
+# find_package(Thrust REQUIRED CONFIG)
+# thrust_create_target(Thrust [options])
+# target_link_libraries(some_project_lib Thrust)
+#
+# # Create default target with: HOST=CPP DEVICE=CUDA
+# thrust_create_target(TargetName)
+#
+# # Create target with: HOST=CPP DEVICE=TBB
+# thrust_create_target(TargetName DEVICE TBB)
+#
+# # Create target with: HOST=TBB DEVICE=OMP
+# thrust_create_target(TargetName HOST TBB DEVICE OMP)
+#
+# # Create CMake cache options THRUST_[HOST|DEVICE]_SYSTEM and configure a
+# # target from them. This allows these systems to be changed by developers at
+# # configure time, per build.
+# thrust_create_target(TargetName FROM_OPTIONS
+#   [HOST_OPTION <option_name>]      # Optionally rename the host system option
+#   [DEVICE_OPTION <option_name>]    # Optionally rename the device system option
+#   [HOST_OPTION_DOC <doc_string>]   # Optionally change the cache label
+#   [DEVICE_OPTION_DOC <doc_string>] # Optionally change the cache label
+#   [HOST <default system>]          # Optionally change the default backend
+#   [DEVICE <default system>]        # Optionally change the default backend
+#   [ADVANCED]                       # Optionally mark options as advanced
+# )
+#
+# # Use a custom TBB, CUB, and/or OMP
+# # (Note that once set, these cannot be changed. This includes COMPONENT
+# # preloading and lazy lookups in thrust_create_target)
+# find_package(Thrust REQUIRED)
+# thrust_set_CUB_target(MyCUBTarget)  # MyXXXTarget contains an existing
+# thrust_set_TBB_target(MyTBBTarget)  # interface to XXX for Thrust to use.
+# thrust_set_OMP_target(MyOMPTarget)
+# thrust_create_target(ThrustWithMyCUB DEVICE CUDA)
+# thrust_create_target(ThrustWithMyTBB DEVICE TBB)
+# thrust_create_target(ThrustWithMyOMP DEVICE OMP)
+#
+# # Create target with HOST=CPP DEVICE=CUDA and some advanced flags set
+# thrust_create_target(TargetName
+#   IGNORE_DEPRECATED_API         # Silence build warnings about deprecated APIs
+#   IGNORE_DEPRECATED_CPP_DIALECT # Silence build warnings about deprecated compilers and C++ standards
+#   IGNORE_DEPRECATED_CPP_11      # Only silence deprecation warnings for C++11
+#   IGNORE_DEPRECATED_COMPILER    # Only silence deprecation warnings for old compilers
+#   IGNORE_CUB_VERSION            # Skip configure-time and compile-time CUB version checks
+# )
+#
+# # Test if a particular system has been loaded. ${var_name} is set to TRUE or
+# # FALSE to indicate if "system" is found.
+# thrust_is_system_found(<system> <var_name>)
+# thrust_is_cuda_system_found(<var_name>)
+# thrust_is_tbb_system_found(<var_name>)
+# thrust_is_omp_system_found(<var_name>)
+# thrust_is_cpp_system_found(<var_name>)
+#
+# # Define / update THRUST_${system}_FOUND flags in current scope
+# thrust_update_system_found_flags()
+#
+# # View verbose log with target and dependency information:
+# $ cmake . --log-level=VERBOSE (CMake 3.15.7 and above)
+#
+# # Print debugging output to status channel:
+# thrust_debug_internal_targets()
+# thrust_debug_target(TargetName "${THRUST_VERSION}")
+
+cmake_minimum_required(VERSION 3.15)
+
+# Minimum supported libcudacxx version:
+set(thrust_libcudacxx_version 1.8.0)
+
+################################################################################
+# User variables and APIs. Users can rely on these:
+#
+
+# Advertise system options:
+set(THRUST_HOST_SYSTEM_OPTIONS
+  CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust host systems."
+  FORCE
+)
+set(THRUST_DEVICE_SYSTEM_OPTIONS
+  CUDA CPP OMP TBB
+  CACHE INTERNAL "Valid Thrust device systems"
+  FORCE
+)
+
+# Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+set(THRUST_VERSION ${${CMAKE_FIND_PACKAGE_NAME}_VERSION} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_MAJOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MAJOR} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_MINOR ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_MINOR} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_PATCH ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_PATCH} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_TWEAK ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_TWEAK} CACHE INTERNAL "" FORCE)
+set(THRUST_VERSION_COUNT ${${CMAKE_FIND_PACKAGE_NAME}_VERSION_COUNT} CACHE INTERNAL "" FORCE)
+
+function(thrust_create_target target_name)
+  thrust_debug("Assembling target ${target_name}. Options: ${ARGN}" internal)
+  set(options
+    ADVANCED
+    FROM_OPTIONS
+    IGNORE_CUB_VERSION_CHECK
+    IGNORE_DEPRECATED_API
+    IGNORE_DEPRECATED_COMPILER
+    IGNORE_DEPRECATED_CPP_11
+    IGNORE_DEPRECATED_CPP_DIALECT
+  )
+  set(keys
+    DEVICE
+    DEVICE_OPTION
+    DEVICE_OPTION_DOC
+    HOST
+    HOST_OPTION
+    HOST_OPTION_DOC
+  )
+  cmake_parse_arguments(TCT "${options}" "${keys}" "" ${ARGN})
+  if (TCT_UNPARSED_ARGUMENTS)
+    message(AUTHOR_WARNING
+      "Unrecognized arguments passed to thrust_create_target: "
+      ${TCT_UNPARSED_ARGUMENTS}
+    )
+  endif()
+
+  # Check that the main Thrust internal target is available
+  # (functions have global scope, targets have directory scope, so this
+  # might happen)
+  if (NOT TARGET Thrust::Thrust)
+    message(AUTHOR_WARNING
+      "The `thrust_create_target` function was called outside the scope of the "
+      "thrust targets. Call find_package again to recreate targets."
+    )
+  endif()
+
+  _thrust_set_if_undefined(TCT_HOST CPP)
+  _thrust_set_if_undefined(TCT_DEVICE CUDA)
+  _thrust_set_if_undefined(TCT_HOST_OPTION THRUST_HOST_SYSTEM)
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION THRUST_DEVICE_SYSTEM)
+  _thrust_set_if_undefined(TCT_HOST_OPTION_DOC "Thrust host system.")
+  _thrust_set_if_undefined(TCT_DEVICE_OPTION_DOC "Thrust device system.")
+
+  if (NOT TCT_HOST IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested HOST=${TCT_HOST}; must be one of ${THRUST_HOST_SYSTEM_OPTIONS}"
+    )
+  endif()
+
+  if (NOT TCT_DEVICE IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR
+      "Requested DEVICE=${TCT_DEVICE}; must be one of ${THRUST_DEVICE_SYSTEM_OPTIONS}"
+    )
+  endif()
+
+  if (TCT_FROM_OPTIONS)
+    _thrust_create_cache_options(
+      ${TCT_HOST} ${TCT_DEVICE}
+      ${TCT_HOST_OPTION} ${TCT_DEVICE_OPTION}
+      ${TCT_HOST_OPTION_DOC} ${TCT_DEVICE_OPTION_DOC}
+      ${TCT_ADVANCED}
+    )
+    set(TCT_HOST ${${TCT_HOST_OPTION}})
+    set(TCT_DEVICE ${${TCT_DEVICE_OPTION}})
+    thrust_debug("Current option settings:" internal)
+    thrust_debug("  - ${TCT_HOST_OPTION}=${TCT_HOST}" internal)
+    thrust_debug("  - ${TCT_DEVICE_OPTION}=${TCT_DEVICE}" internal)
+  endif()
+
+  _thrust_find_backend(${TCT_HOST} REQUIRED)
+  _thrust_find_backend(${TCT_DEVICE} REQUIRED)
+
+  # We can just create an INTERFACE IMPORTED target here instead of going
+  # through _thrust_declare_interface_alias as long as we aren't hanging any
+  # Thrust/CUB include paths directly on ${target_name}.
+  add_library(${target_name} INTERFACE IMPORTED)
+  target_link_libraries(${target_name}
+    INTERFACE
+    Thrust::${TCT_HOST}::Host
+    Thrust::${TCT_DEVICE}::Device
+  )
+
+  # This would be nice to enforce, but breaks when using old cmake + new
+  # compiler, since cmake doesn't know what features the new compiler version
+  # supports.
+  # Leaving this here as a reminder not to add it back. Just let the
+  # compile-time checks in thrust/detail/config/cpp_dialect.h handle it.
+  #
+  #  if (NOT TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+  #    if (TCT_IGNORE_DEPRECATED_CPP_11)
+  #      target_compile_features(${target_name} INTERFACE cxx_std_11)
+  #    else()
+  #      target_compile_features(${target_name} INTERFACE cxx_std_14)
+  #    endif()
+  #  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_DIALECT)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_DIALECT")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_API)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_API")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_CPP_11)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_CPP_11")
+  endif()
+
+  if (TCT_IGNORE_DEPRECATED_COMPILER)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_DEPRECATED_COMPILER")
+  endif()
+
+  if (TCT_IGNORE_CUB_VERSION_CHECK)
+    target_compile_definitions(${target_name} INTERFACE "THRUST_IGNORE_CUB_VERSION_CHECK")
+  else()
+    if (("${TCT_HOST}" STREQUAL "CUDA" OR "${TCT_DEVICE}" STREQUAL "CUDA") AND
+    (NOT THRUST_VERSION VERSION_EQUAL THRUST_CUB_VERSION))
+      message(FATAL_ERROR
+        "The version of CUB found by CMake is not compatible with this release of Thrust. "
+        "CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. "
+        "Pass IGNORE_CUB_VERSION_CHECK to thrust_create_target to ignore. "
+        "(CUB ${THRUST_CUB_VERSION}, Thrust ${THRUST_VERSION})."
+        )
+    endif()
+  endif()
+
+  thrust_debug_target(${target_name} "Thrust ${THRUST_VERSION}"  internal)
+endfunction()
+
+function(thrust_is_system_found system var_name)
+  if (TARGET Thrust::${system})
+    set(${var_name} TRUE PARENT_SCOPE)
+  else()
+    set(${var_name} FALSE PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(thrust_is_cpp_system_found var_name)
+  thrust_is_system_found(CPP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_cuda_system_found var_name)
+  thrust_is_system_found(CUDA ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_tbb_system_found var_name)
+  thrust_is_system_found(TBB ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+function(thrust_is_omp_system_found var_name)
+  thrust_is_system_found(OMP ${var_name})
+  set(${var_name} ${${var_name}} PARENT_SCOPE)
+endfunction()
+
+# Since components are loaded lazily, this will refresh the
+# THRUST_${component}_FOUND flags in the current scope.
+# Alternatively, check system states individually using the
+# thrust_is_system_found functions.
+macro(thrust_update_system_found_flags)
+  set(THRUST_FOUND TRUE)
+  thrust_is_system_found(CPP  THRUST_CPP_FOUND)
+  thrust_is_system_found(CUDA THRUST_CUDA_FOUND)
+  thrust_is_system_found(TBB  THRUST_TBB_FOUND)
+  thrust_is_system_found(OMP  THRUST_OMP_FOUND)
+endmacro()
+
+function(thrust_debug msg)
+  # Use the VERBOSE channel when called internally
+  # Run `cmake . --log-level=VERBOSE` to view.
+  if ("${ARGN}" STREQUAL "internal")
+    # If CMake is too old to know about the VERBOSE channel, just be silent.
+    # Users reproduce much the same output on the STATUS channel by using:
+    # thrust_create_target(Thrust [...])
+    # thrust_debug_internal_targets()
+    # thrust_debug_target(Thrust)
+    if (CMAKE_VERSION VERSION_GREATER_EQUAL "3.15.7")
+      set(channel VERBOSE)
+    else()
+      return()
+    endif()
+  else()
+    set(channel STATUS)
+  endif()
+
+  message(${channel} "Thrust: ${msg}")
+endfunction()
+
+# Print details of the specified target.
+function(thrust_debug_target target_name version)
+  if (NOT TARGET ${target_name})
+    return()
+  endif()
+
+  set(is_internal "${ARGN}")
+
+  if (version)
+    set(version "(${version})")
+  endif()
+
+  thrust_debug("TargetInfo: ${target_name}: ${version}" ${is_internal})
+
+  function(_thrust_print_prop_if_set target_name prop)
+    get_target_property(value ${target_name} ${prop})
+    if (value)
+      thrust_debug("TargetInfo: ${target_name} > ${prop}: ${value}" ${is_internal})
+    endif()
+  endfunction()
+
+  function(_thrust_print_imported_prop_if_set target_name prop)
+    get_target_property(imported ${target_name} IMPORTED)
+    get_target_property(type ${target_name} TYPE)
+    if (imported AND NOT ${type} STREQUAL "INTERFACE_LIBRARY")
+      _thrust_print_prop_if_set(${target_name} ${prop})
+    endif()
+  endfunction()
+
+  _thrust_print_prop_if_set(${target_name} ALIASED_TARGET)
+  _thrust_print_prop_if_set(${target_name} IMPORTED)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_DEFINITIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_FEATURES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_COMPILE_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DEPENDS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_LIBRARIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_LINK_OPTIONS)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_HOST)
+  _thrust_print_prop_if_set(${target_name} INTERFACE_THRUST_DEVICE)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_DEBUG)
+  _thrust_print_imported_prop_if_set(${target_name} IMPORTED_LOCATION_RELEASE)
+endfunction()
+
+function(thrust_debug_internal_targets)
+  function(_thrust_debug_backend_targets backend version)
+    thrust_debug_target(Thrust::${backend} "${version}")
+    thrust_debug_target(Thrust::${backend}::Host "${version}")
+    thrust_debug_target(Thrust::${backend}::Device "${version}")
+  endfunction()
+
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(CPP "Thrust ${THRUST_VERSION}")
+
+  _thrust_debug_backend_targets(OMP "${THRUST_OMP_VERSION}")
+  thrust_debug_target(OpenMP::OpenMP_CXX "${THRUST_OMP_VERSION}")
+
+  _thrust_debug_backend_targets(TBB "${THRUST_TBB_VERSION}")
+  thrust_debug_target(TBB:tbb "${THRUST_TBB_VERSION}")
+
+  _thrust_debug_backend_targets(CUDA "CUB ${THRUST_CUB_VERSION}")
+  thrust_debug_target(CUB::CUB "${THRUST_CUB_VERSION}")
+  thrust_debug_target(libcudacxx::libcudacxx "${THRUST_libcudacxx_VERSION}")
+endfunction()
+
+################################################################################
+# Internal utilities. Subject to change.
+#
+
+function(_thrust_set_if_undefined var)
+  if (NOT DEFINED ${var})
+    set(${var} ${ARGN} PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_thrust_declare_interface_alias alias_name ugly_name)
+  # 1) Only IMPORTED and ALIAS targets can be placed in a namespace.
+  # 2) When an IMPORTED library is linked to another target, its include
+  #    directories are treated as SYSTEM includes.
+  # 3) nvcc will automatically check the CUDA Toolkit include path *before* the
+  #    system includes. This means that the Toolkit Thrust will *always* be used
+  #    during compilation, and the include paths of an IMPORTED Thrust::Thrust
+  #    target will never have any effect.
+  # 4) This behavior can be fixed by setting the property NO_SYSTEM_FROM_IMPORTED
+  #    on EVERY target that links to Thrust::Thrust. This would be a burden and a
+  #    footgun for our users. Forgetting this would silently pull in the wrong thrust!
+  # 5) A workaround is to make a non-IMPORTED library outside of the namespace,
+  #    configure it, and then ALIAS it into the namespace (or ALIAS and then
+  #    configure, that seems to work too).
+  add_library(${ugly_name} INTERFACE)
+  add_library(${alias_name} ALIAS ${ugly_name})
+endfunction()
+
+# Create cache options for selecting the user/device systems with ccmake/cmake-gui.
+function(_thrust_create_cache_options host device host_option device_option host_doc device_doc advanced)
+  thrust_debug("Creating system cache options: (advanced=${advanced})" internal)
+  thrust_debug("  - Host Option=${host_option} Default=${host} Doc='${host_doc}'" internal)
+  thrust_debug("  - Device Option=${device_option} Default=${device} Doc='${device_doc}'" internal)
+  set(${host_option} ${host} CACHE STRING "${host_doc}")
+  set_property(CACHE ${host_option} PROPERTY STRINGS ${THRUST_HOST_SYSTEM_OPTIONS})
+  set(${device_option} ${device} CACHE STRING "${device_doc}")
+  set_property(CACHE ${device_option} PROPERTY STRINGS ${THRUST_DEVICE_SYSTEM_OPTIONS})
+  if (advanced)
+    mark_as_advanced(${host_option} ${device_option})
+  endif()
+endfunction()
+
+# Create Thrust::${backend}::Host and Thrust::${backend}::Device targets.
+# Assumes that `Thrust::${backend}` and `_Thrust_${backend}` have been created
+# by _thrust_declare_interface_alias and configured to bring in system
+# dependency interfaces (including Thrust::Thrust).
+function(_thrust_setup_system backend)
+  set(backend_target_alias "Thrust::${backend}")
+
+  if (backend IN_LIST THRUST_HOST_SYSTEM_OPTIONS)
+    set(host_target "_Thrust_${backend}_Host")
+    set(host_target_alias "Thrust::${backend}::Host")
+    if (NOT TARGET ${host_target_alias})
+      _thrust_declare_interface_alias(${host_target_alias} ${host_target})
+      target_compile_definitions(${host_target} INTERFACE
+        "THRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_${backend}")
+      target_link_libraries(${host_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${host_target} PROPERTY INTERFACE_THRUST_HOST ${backend})
+      set_property(TARGET ${host_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_HOST)
+      thrust_debug_target(${host_target_alias} "" internal)
+    endif()
+  endif()
+
+  if (backend IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    set(device_target "_Thrust_${backend}_Device")
+    set(device_target_alias "Thrust::${backend}::Device")
+    if (NOT TARGET ${device_target_alias})
+      _thrust_declare_interface_alias(${device_target_alias} ${device_target})
+      target_compile_definitions(${device_target} INTERFACE
+        "THRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_${backend}")
+      target_link_libraries(${device_target} INTERFACE ${backend_target_alias})
+      set_property(TARGET ${device_target} PROPERTY INTERFACE_THRUST_DEVICE ${backend})
+      set_property(TARGET ${device_target} APPEND PROPERTY COMPATIBLE_INTERFACE_STRING THRUST_DEVICE)
+      thrust_debug_target(${device_target_alias} "" internal)
+    endif()
+  endif()
+endfunction()
+
+# Use the provided cub_target for the CUDA backend. If Thrust::CUB already
+# exists, this call has no effect.
+function(thrust_set_CUB_target cub_target)
+  if (NOT TARGET Thrust::CUB)
+    thrust_debug("Setting CUB target to ${cub_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_CUB_VERSION ${CUB_VERSION} CACHE INTERNAL
+      "CUB version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::CUB _Thrust_CUB)
+    target_link_libraries(_Thrust_CUB INTERFACE ${cub_target})
+    thrust_debug_target(${cub_target} "${THRUST_CUB_VERSION}" internal)
+    thrust_debug_target(Thrust::CUB "CUB ${THRUST_CUB_VERSION}" internal)
+  endif()
+endfunction()
+
+# Internal use only -- libcudacxx must be found during the initial
+# `find_package(Thrust)` call and cannot be set afterwards. See README.md in
+# this directory for details on using a specific libcudacxx target.
+function(_thrust_set_libcudacxx_target libcudacxx_target)
+  if (NOT TARGET Thrust::libcudacxx)
+    thrust_debug("Setting libcudacxx target to ${libcudacxx_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_libcudacxx_VERSION ${libcudacxx_VERSION} CACHE INTERNAL
+      "libcudacxx version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::libcudacxx _Thrust_libcudacxx)
+    target_link_libraries(_Thrust_libcudacxx INTERFACE ${libcudacxx_target})
+    thrust_debug_target(${libcudacxx_target} "${THRUST_libcudacxx_VERSION}" internal)
+    thrust_debug_target(Thrust::libcudacxx "libcudacxx ${THRUST_libcudacxx_VERSION}" internal)
+  endif()
+endfunction()
+
+# Use the provided tbb_target for the TBB backend. If Thrust::TBB already
+# exists, this call has no effect.
+function(thrust_set_TBB_target tbb_target)
+  if (NOT TARGET Thrust::TBB)
+    thrust_debug("Setting TBB target to ${tbb_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_TBB_VERSION ${TBB_VERSION} CACHE INTERNAL
+      "TBB version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::TBB _Thrust_TBB)
+    target_link_libraries(_Thrust_TBB INTERFACE Thrust::Thrust ${tbb_target})
+    thrust_debug_target(${tbb_target} "${THRUST_TBB_VERSION}" internal)
+    thrust_debug_target(Thrust::TBB "${THRUST_TBB_VERSION}" internal)
+    _thrust_setup_system(TBB)
+  endif()
+endfunction()
+
+# Use the provided omp_target for the OMP backend. If Thrust::OMP already
+# exists, this call has no effect.
+function(thrust_set_OMP_target omp_target)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Setting OMP target to ${omp_target}" internal)
+    # Workaround cmake issue #20670 https://gitlab.kitware.com/cmake/cmake/-/issues/20670
+    set(THRUST_OMP_VERSION ${OpenMP_CXX_VERSION} CACHE INTERNAL
+      "OpenMP version used by Thrust"
+      FORCE
+    )
+    _thrust_declare_interface_alias(Thrust::OMP _Thrust_OMP)
+    target_link_libraries(_Thrust_OMP INTERFACE Thrust::Thrust ${omp_target})
+    thrust_debug_target(${omp_target} "${THRUST_OMP_VERSION}" internal)
+    thrust_debug_target(Thrust::OMP "${THRUST_OMP_VERSION}" internal)
+    _thrust_setup_system(OMP)
+  endif()
+endfunction()
+
+function(_thrust_find_CPP required)
+  if (NOT TARGET Thrust::CPP)
+    thrust_debug("Generating CPP targets." internal)
+    _thrust_declare_interface_alias(Thrust::CPP _Thrust_CPP)
+    target_link_libraries(_Thrust_CPP INTERFACE Thrust::Thrust)
+    thrust_debug_target(Thrust::CPP "Thrust ${THRUST_VERSION}" internal)
+    _thrust_setup_system(CPP)
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_CUDA required)
+  if (NOT TARGET Thrust::CUB)
+    thrust_debug("Searching for CUB ${required}" internal)
+    find_package(CUB ${THRUST_VERSION} CONFIG
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+      NO_DEFAULT_PATH # Only check the explicit HINTS below:
+      HINTS
+        "${_THRUST_INCLUDE_DIR}/dependencies/cub" # Source layout (GitHub)
+        "${_THRUST_INCLUDE_DIR}/../cub/cub/cmake" # Source layout (Perforce)
+        "${_THRUST_CMAKE_DIR}/.."                 # Install layout
+    )
+
+    if (TARGET CUB::CUB)
+      thrust_set_CUB_target(CUB::CUB)
+    else()
+      thrust_debug("CUB not found!" internal)
+    endif()
+  endif()
+
+  if (NOT TARGET Thrust::CUDA)
+    _thrust_declare_interface_alias(Thrust::CUDA _Thrust_CUDA)
+    _thrust_setup_system(CUDA)
+    target_link_libraries(_Thrust_CUDA INTERFACE
+      Thrust::Thrust
+      Thrust::CUB
+    )
+    thrust_debug_target(Thrust::CUDA "" internal)
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like TBB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_TBB required)
+  if(NOT TARGET Thrust::TBB)
+    thrust_debug("Searching for TBB ${required}" internal)
+    # Swap in a temporary module path to make sure we use our FindTBB.cmake
+    set(_THRUST_STASH_MODULE_PATH "${CMAKE_MODULE_PATH}")
+    set(CMAKE_MODULE_PATH "${_THRUST_CMAKE_DIR}")
+
+    # Push policy CMP0074 to silence warnings about TBB_ROOT being set. This
+    # var is used unconventionally in this FindTBB.cmake module.
+    # Someday we'll have a suitable TBB cmake configuration and can avoid this.
+    cmake_policy(PUSH)
+    cmake_policy(SET CMP0074 OLD)
+    set(THRUST_TBB_ROOT "" CACHE PATH "Path to the root of the TBB installation.")
+    if (TBB_ROOT AND NOT THRUST_TBB_ROOT)
+      message(
+        "Warning: TBB_ROOT is set. "
+        "Thrust uses THRUST_TBB_ROOT to avoid issues with CMake Policy CMP0074. "
+        "Please set this variable instead when using Thrust with TBB."
+      )
+    endif()
+    set(TBB_ROOT "${THRUST_TBB_ROOT}")
+    set(_THRUST_STASH_TBB_ROOT "${TBB_ROOT}")
+
+    find_package(TBB
+      ${_THRUST_QUIET_FLAG}
+      ${required}
+    )
+
+    cmake_policy(POP)
+    set(TBB_ROOT "${_THRUST_STASH_TBB_ROOT}")
+    set(CMAKE_MODULE_PATH "${_THRUST_STASH_MODULE_PATH}")
+
+    if (TARGET TBB::tbb)
+      thrust_set_TBB_target(TBB::tbb)
+    else()
+      thrust_debug("TBB not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# Wrap the OpenMP flags for CUDA targets
+function(thrust_fixup_omp_target omp_target)
+  get_target_property(opts ${omp_target} INTERFACE_COMPILE_OPTIONS)
+  if (opts MATCHES "\\$<\\$<COMPILE_LANGUAGE:CXX>:([^>]*)>")
+    target_compile_options(${omp_target} INTERFACE
+      $<$<AND:$<COMPILE_LANGUAGE:CUDA>,$<CUDA_COMPILER_ID:NVIDIA>>:-Xcompiler=${CMAKE_MATCH_1}>
+    )
+  endif()
+endfunction()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like OpenMP_CXX_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_OMP required)
+  if (NOT TARGET Thrust::OMP)
+    thrust_debug("Searching for OMP ${required}" internal)
+    find_package(OpenMP
+      ${_THRUST_QUIET_FLAG}
+      ${_THRUST_REQUIRED_FLAG_OMP}
+      COMPONENTS CXX
+    )
+
+    if (TARGET OpenMP::OpenMP_CXX)
+      thrust_fixup_omp_target(OpenMP::OpenMP_CXX)
+      thrust_set_OMP_target(OpenMP::OpenMP_CXX)
+    else()
+      thrust_debug("OpenMP::OpenMP_CXX not found!" internal)
+    endif()
+  endif()
+endmacro()
+
+# This must be a macro instead of a function to ensure that backends passed to
+# find_package(Thrust COMPONENTS [...]) have their full configuration loaded
+# into the current scope. This provides at least some remedy for CMake issue
+# #20670 -- otherwise variables like CUB_VERSION, etc won't be in the caller's
+# scope.
+macro(_thrust_find_backend backend required)
+  # Unfortunately, _thrust_find_${backend}(req) is not valid CMake syntax. Hence
+  # why this function exists.
+  if ("${backend}" STREQUAL "CPP")
+    _thrust_find_CPP("${required}")
+  elseif ("${backend}" STREQUAL "CUDA")
+    _thrust_find_CUDA("${required}")
+  elseif ("${backend}" STREQUAL "TBB")
+    _thrust_find_TBB("${required}")
+  elseif ("${backend}" STREQUAL "OMP")
+    _thrust_find_OMP("${required}")
+  else()
+    message(FATAL_ERROR "_thrust_find_backend: Invalid system: ${backend}")
+  endif()
+endmacro()
+
+################################################################################
+# Initialization. Executed inside find_package(Thrust) call.
+#
+
+if (${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY)
+  set(_THRUST_QUIET ON CACHE INTERNAL "Quiet mode enabled for Thrust find_package calls." FORCE)
+  set(_THRUST_QUIET_FLAG "QUIET" CACHE INTERNAL "" FORCE)
+else()
+  unset(_THRUST_QUIET CACHE)
+  unset(_THRUST_QUIET_FLAG CACHE)
+endif()
+
+set(_THRUST_CMAKE_DIR "${CMAKE_CURRENT_LIST_DIR}" CACHE INTERNAL
+  "Location of thrust-config.cmake"
+  FORCE
+)
+
+# Internal target that actually holds the Thrust interface. Used by all other Thrust targets.
+if (NOT TARGET Thrust::Thrust)
+  _thrust_declare_interface_alias(Thrust::Thrust _Thrust_Thrust)
+  # Pull in the include dir detected by thrust-config-version.cmake
+  set(_THRUST_INCLUDE_DIR "${_THRUST_VERSION_INCLUDE_DIR}"
+    CACHE INTERNAL "Location of Thrust headers."
+    FORCE
+  )
+  unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear tmp variable from cache
+  target_include_directories(_Thrust_Thrust INTERFACE "${_THRUST_INCLUDE_DIR}")
+  thrust_debug_target(Thrust::Thrust "${THRUST_VERSION}" internal)
+endif()
+
+# Find libcudacxx prior to locating backend-specific deps. This ensures that CUB
+# finds the same package.
+if (NOT TARGET Thrust::libcudacxx)
+  thrust_debug("Searching for libcudacxx REQUIRED" internal)
+
+  # First do a non-required search for any co-packaged versions.
+  # These are preferred.
+  find_package(libcudacxx ${thrust_libcudacxx_version} CONFIG
+    ${_THRUST_QUIET_FLAG}
+    NO_DEFAULT_PATH # Only check the explicit HINTS below:
+    HINTS
+      "${_THRUST_INCLUDE_DIR}/dependencies/libcudacxx" # Source layout (GitHub)
+      "${_THRUST_INCLUDE_DIR}/../libcudacxx"           # Source layout (Perforce)
+      "${_THRUST_CMAKE_DIR}/.."                        # Install layout
+  )
+
+  # A second required search allows externally packaged to be used and fails if
+  # no suitable package exists.
+  find_package(libcudacxx ${thrust_libcudacxx_version} CONFIG
+    REQUIRED
+    ${_THRUST_QUIET_FLAG}
+  )
+
+  if (TARGET libcudacxx::libcudacxx)
+    _thrust_set_libcudacxx_target(libcudacxx::libcudacxx)
+  else()
+    thrust_debug("Expected libcudacxx::libcudacxx target not found!" internal)
+  endif()
+
+  target_link_libraries(_Thrust_Thrust INTERFACE Thrust::libcudacxx)
+endif()
+
+# Handle find_package COMPONENT requests:
+foreach(component ${${CMAKE_FIND_PACKAGE_NAME}_FIND_COMPONENTS})
+  if (NOT component IN_LIST THRUST_HOST_SYSTEM_OPTIONS AND
+      NOT component IN_LIST THRUST_DEVICE_SYSTEM_OPTIONS)
+    message(FATAL_ERROR "Invalid component requested: '${component}'")
+  endif()
+
+  unset(req)
+  if (${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED_${component})
+    set(req "REQUIRED")
+  endif()
+
+  thrust_debug("Preloading COMPONENT '${component}' ${req}" internal)
+  _thrust_find_backend(${component} "${req}")
+endforeach()
+
+thrust_update_system_found_flags()
+
+include(FindPackageHandleStandardArgs)
+if (NOT Thrust_CONFIG)
+  set(Thrust_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
+endif()
+find_package_handle_standard_args(Thrust CONFIG_MODE)
diff --git a/thrust/cmake/thrust-header-search.cmake b/thrust/cmake/thrust-header-search.cmake
new file mode 100644
index 000000000..3d69398a7
--- /dev/null
+++ b/thrust/cmake/thrust-header-search.cmake
@@ -0,0 +1,6 @@
+# Parse version information from version.h in source tree
+set(_THRUST_VERSION_INCLUDE_DIR "${CMAKE_CURRENT_LIST_DIR}/../..")
+if(EXISTS "${_THRUST_VERSION_INCLUDE_DIR}/thrust/version.h")
+  set(_THRUST_VERSION_INCLUDE_DIR "${_THRUST_VERSION_INCLUDE_DIR}" CACHE FILEPATH "" FORCE) # Clear old result
+  set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
+endif()
diff --git a/thrust/cmake/thrust-header-search.cmake.in b/thrust/cmake/thrust-header-search.cmake.in
new file mode 100644
index 000000000..c014c469b
--- /dev/null
+++ b/thrust/cmake/thrust-header-search.cmake.in
@@ -0,0 +1,18 @@
+# Parse version information from version.h:
+unset(_THRUST_VERSION_INCLUDE_DIR CACHE) # Clear old result to force search
+
+# Find CMAKE_INSTALL_INCLUDEDIR=@CMAKE_INSTALL_INCLUDEDIR@ directory"
+set(from_install_prefix "@install_location@")
+
+# Transform to a list of directories, replace each directoy with "../"
+# and convert back to a string
+string(REGEX REPLACE "/" ";" from_install_prefix "${from_install_prefix}")
+list(TRANSFORM from_install_prefix REPLACE ".+" "../")
+list(JOIN from_install_prefix "" from_install_prefix)
+
+find_path(_THRUST_VERSION_INCLUDE_DIR thrust/version.h
+  NO_DEFAULT_PATH # Only search explicit paths below:
+  PATHS
+    "${CMAKE_CURRENT_LIST_DIR}/${from_install_prefix}/@CMAKE_INSTALL_INCLUDEDIR@"
+)
+set_property(CACHE _THRUST_VERSION_INCLUDE_DIR PROPERTY TYPE INTERNAL)
diff --git a/thrust/complex.h b/thrust/complex.h
index 124cf31e6..8c0be0d61 100644
--- a/thrust/complex.h
+++ b/thrust/complex.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2019 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -28,19 +28,32 @@
 #include <sstream>
 #include <thrust/detail/type_traits.h>
 
-
-namespace thrust
-{
+#if THRUST_CPP_DIALECT >= 2011
+#  define THRUST_STD_COMPLEX_REAL(z) \
+    reinterpret_cast< \
+      const typename thrust::detail::remove_reference<decltype(z)>::type::value_type (&)[2] \
+    >(z)[0]
+#  define THRUST_STD_COMPLEX_IMAG(z) \
+    reinterpret_cast< \
+      const typename thrust::detail::remove_reference<decltype(z)>::type::value_type (&)[2] \
+    >(z)[1]
+#  define THRUST_STD_COMPLEX_DEVICE __device__
+#else
+#  define THRUST_STD_COMPLEX_REAL(z) (z).real()
+#  define THRUST_STD_COMPLEX_IMAG(z) (z).imag()
+#  define THRUST_STD_COMPLEX_DEVICE
+#endif
+
+THRUST_NAMESPACE_BEGIN
 
 /*
- *  Calls to the standard math library from inside the thrust namespace 
+ *  Calls to the standard math library from inside the thrust namespace
  *  with real arguments require explicit scope otherwise they will fail
  *  to resolve as it will find the equivalent complex function but then
  *  fail to match the template, and give up looking for other scopes.
  */
 
 
-
 /*! \addtogroup numerics
  *  \{
  */
@@ -49,13 +62,94 @@ namespace thrust
  *  \{
  */
 
-  /*! \p complex is the Thrust equivalent to <tt>std::complex</tt>. It is functionally
-   *  equivalent to it, but can also be used in device code which <tt>std::complex</tt> currently cannot.
-   *
-   *  \tparam T The type used to hold the real and imaginary parts. Should be <tt>float</tt> 
-   *  or <tt>double</tt>. Others types are not supported.
-   *
-   */
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T, std::size_t Align>
+struct complex_storage;
+
+#if THRUST_CPP_DIALECT >= 2011                                                    \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                       \
+  && (THRUST_GCC_VERSION >= 40800)
+  // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    struct alignas(Align) type { T x; T y; };
+  };
+#elif  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)                    \
+    || (   (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                 \
+        && (THRUST_GCC_VERSION < 40600))
+  // C++03 implementation for MSVC and GCC <= 4.5.
+  //
+  // We have to implement `aligned_type` with specializations for MSVC
+  // and GCC 4.2 and older because they require literals as arguments to
+  // their alignment attribute.
+
+  #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+    // MSVC implementation.
+    #define THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(X)                   \
+      template <typename T>                                                   \
+      struct complex_storage<T, X>                                            \
+      {                                                                       \
+        __declspec(align(X)) struct type { T x; T y; };                       \
+      };                                                                      \
+      /**/
+  #else
+    // GCC <= 4.2 implementation.
+    #define THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(X)                   \
+      template <typename T>                                                   \
+      struct complex_storage<T, X>                                            \
+      {                                                                       \
+        struct type { T x; T y; } __attribute__((aligned(X)));                \
+      };                                                                      \
+      /**/
+  #endif
+
+  // The primary template is a fallback, which doesn't specify any alignment.
+  // It's only used when T is very large and we're using an older compilers
+  // which we have to fully specialize each alignment case.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    T x; T y;
+  };
+
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(1);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(2);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(4);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(8);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(16);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(32);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(64);
+  THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION(128);
+
+  #undef THRUST_DEFINE_COMPLEX_STORAGE_SPECIALIZATION
+#else
+  // C++03 implementation for GCC > 4.5, Clang, PGI, ICPC, and xlC.
+  template <typename T, std::size_t Align>
+  struct complex_storage
+  {
+    struct type { T x; T y; } __attribute__((aligned(Align)));
+  };
+#endif
+
+} // end namespace detail
+
+/*! \endcond
+ */
+
+/*! \p complex is the Thrust equivalent to <tt>std::complex</tt>. It is
+ *  functionally identical to it, but can also be used in device code which
+ *  <tt>std::complex</tt> currently cannot.
+ *
+ *  \tparam T The type used to hold the real and imaginary parts. Should be
+ *  <tt>float</tt> or <tt>double</tt>. Others types are not supported.
+ *
+ */
 template <typename T>
 struct complex
 {
@@ -65,100 +159,261 @@ struct complex
    */
   typedef T value_type;
 
+
+
   /* --- Constructors --- */
 
+  /*! Construct a complex number with an imaginary part of 0.
+   *
+   *  \param re The real part of the number.
+   */
+  __host__ __device__
+  complex(const T& re);
+
   /*! Construct a complex number from its real and imaginary parts.
    *
    *  \param re The real part of the number.
    *  \param im The imaginary part of the number.
    */
-  inline __host__ __device__      
-  complex(const T & re = T(), const T& im = T());
+  __host__ __device__
+  complex(const T& re, const T& im);
+
+#if THRUST_CPP_DIALECT >= 2011
+  /*! Default construct a complex number.
+   */
+  complex() = default;
+
+  /*! This copy constructor copies from a \p complex with a type that is
+   *  convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  complex(const complex<T>& z) = default;
+#else
+  /*! Default construct a complex number.
+   */
+  __host__ __device__
+  complex();
+
+  /*! This copy constructor copies from a \p complex with a type that is
+   *  convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ __device__
+  complex(const complex<T>& z);
+#endif
+
+  /*! This converting copy constructor copies from a \p complex with a type
+   *  that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex(const complex<U>& z);
+
+  /*! This converting copy constructor copies from a <tt>std::complex</tt> with
+   *  a type that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   */
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex(const std::complex<T>& z);
+
+  /*! This converting copy constructor copies from a <tt>std::complex</tt> with
+   *  a type that is convertible to this \p complex's \c value_type.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex(const std::complex<U>& z);
+
+
+
+  /* --- Assignment Operators --- */
+
+  /*! Assign `re` to the real part of this \p complex and set the imaginary part
+   *  to 0.
+   *
+   *  \param re The real part of the number.
+   */
+  __host__ __device__
+  complex& operator=(const T& re);
 
-  /*! This copy constructor copies from a \p complex with a type that
-   *  is convertible to this \p complex \c value_type.
+#if THRUST_CPP_DIALECT >= 2011
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
    *
    *  \param z The \p complex to copy from.
+   */
+  complex& operator=(const complex<T>& z) = default;
+#else
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
    *
-   *  \tparam X is convertible to \c value_type.
+   *  \param z The \p complex to copy from.
    */
-  template <typename X> 
-  inline __host__ __device__
-  complex(const complex<X> & z);
-  
-  /*! This copy constructor copies from a <tt>std::complex</tt> with a type that
-   *  is convertible to this \p complex \c value_type.
+  __host__ __device__
+  complex& operator=(const complex<T>& z);
+#endif
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
    *
    *  \param z The \p complex to copy from.
    *
-   *  \tparam X is convertible to \c value_type.
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex& operator=(const complex<U>& z);
+
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
    */
-  template <typename X> 
-    inline __host__
-  complex(const std::complex<X> & z);
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex& operator=(const std::complex<T>& z);
 
+  /*! Assign `z.real()` and `z.imag()` to the real and imaginary parts of this
+   *  \p complex respectively.
+   *
+   *  \param z The \p complex to copy from.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ THRUST_STD_COMPLEX_DEVICE
+  complex& operator=(const std::complex<U>& z);
 
 
   /* --- Compound Assignment Operators --- */
 
-  /*! Adds a \p complex to this \p complex and 
-   *  assigns the result to this \p complex.
+  /*! Adds a \p complex to this \p complex and assigns the result to this
+   *  \p complex.
    *
-   *  \param z The \p complex to be Added.
+   *  \param z The \p complex to be added.
+   *
+   *  \tparam U is convertible to \c value_type.
    */
+  template <typename U>
   __host__ __device__
-  inline complex<T>& operator+=(const complex<T> z);
+  complex<T>& operator+=(const complex<U>& z);
 
-  /*! Subtracts a \p complex from this \p complex and 
-   *  assigns the result to this \p complex.
+  /*! Subtracts a \p complex from this \p complex and assigns the result to
+   *  this \p complex.
    *
    *  \param z The \p complex to be subtracted.
+   *
+   *  \tparam U is convertible to \c value_type.
    */
+  template <typename U>
   __host__ __device__
-  inline complex<T>& operator-=(const complex<T> z);
+  complex<T>& operator-=(const complex<U>& z);
 
-  /*! Multiplies this \p complex by another \p complex and 
-   *  assigns the result to this \p complex.
+  /*! Multiplies this \p complex by another \p complex and assigns the result
+   *  to this \p complex.
    *
    *  \param z The \p complex to be multiplied.
+   *
+   *  \tparam U is convertible to \c value_type.
    */
+  template <typename U>
   __host__ __device__
-  inline complex<T>& operator*=(const complex<T> z);
+  complex<T>& operator*=(const complex<U>& z);
 
-  /*! Divides this \p complex by another \p complex and 
-   *  assigns the result to this \p complex.
+  /*! Divides this \p complex by another \p complex and assigns the result to
+   *  this \p complex.
    *
    *  \param z The \p complex to be divided.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator/=(const complex<U>& z);
+
+  /*! Adds a scalar to this \p complex and assigns the result to this
+   *  \p complex.
+   *
+   *  \param z The \p complex to be added.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator+=(const U& z);
+
+  /*! Subtracts a scalar from this \p complex and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The scalar to be subtracted.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator-=(const U& z);
+
+  /*! Multiplies this \p complex by a scalar and assigns the result
+   *  to this \p complex.
+   *
+   *  \param z The scalar to be multiplied.
+   *
+   *  \tparam U is convertible to \c value_type.
+   */
+  template <typename U>
+  __host__ __device__
+  complex<T>& operator*=(const U& z);
+
+  /*! Divides this \p complex by a scalar and assigns the result to
+   *  this \p complex.
+   *
+   *  \param z The scalar to be divided.
+   *
+   *  \tparam U is convertible to \c value_type.
    */
+  template <typename U>
   __host__ __device__
-  inline complex<T>& operator/=(const complex<T> z);
+  complex<T>& operator/=(const U& z);
 
 
 
-  /* --- Getter functions --- 
+  /* --- Getter functions ---
    * The volatile ones are there to help for example
    * with certain reductions optimizations
    */
 
   /*! Returns the real part of this \p complex.
    */
-  __host__ __device__ inline T real() const volatile{ return m_data[0]; }
+  __host__ __device__
+  T real() const volatile { return data.x; }
 
   /*! Returns the imaginary part of this \p complex.
    */
-  __host__ __device__ inline T imag() const volatile{ return m_data[1]; }
+  __host__ __device__
+  T imag() const volatile { return data.y; }
 
   /*! Returns the real part of this \p complex.
    */
-  __host__ __device__ inline T real() const{ return m_data[0]; }
+  __host__ __device__
+  T real() const { return data.x; }
 
   /*! Returns the imaginary part of this \p complex.
    */
-  __host__ __device__ inline T imag() const{ return m_data[1]; }
+  __host__ __device__
+  T imag() const { return data.y; }
 
 
 
-  /* --- Setter functions --- 
+  /* --- Setter functions ---
    * The volatile ones are there to help for example
    * with certain reductions optimizations
    */
@@ -167,25 +422,29 @@ struct complex
    *
    *  \param re The new real part of this \p complex.
    */
-  __host__ __device__ inline void real(T re)volatile{ m_data[0] = re; }
+  __host__ __device__
+  void real(T re) volatile { data.x = re; }
 
   /*! Sets the imaginary part of this \p complex.
    *
    *  \param im The new imaginary part of this \p complex.e
    */
-  __host__ __device__ inline void imag(T im)volatile{ m_data[1] = im; }
+  __host__ __device__
+  void imag(T im) volatile { data.y = im; }
 
   /*! Sets the real part of this \p complex.
    *
    *  \param re The new real part of this \p complex.
    */
-  __host__ __device__ inline void real(T re){ m_data[0] = re; }
+  __host__ __device__
+  void real(T re) { data.x = re; }
 
   /*! Sets the imaginary part of this \p complex.
    *
    *  \param im The new imaginary part of this \p complex.
    */
-  __host__ __device__ inline void imag(T im){ m_data[1] = im; }
+  __host__ __device__
+  void imag(T im) { data.y = im; }
 
 
 
@@ -193,10 +452,11 @@ struct complex
 
   /*! Casts this \p complex to a <tt>std::complex</tt> of the same type.
    */
-  inline operator std::complex<T>() const { return std::complex<T>(real(),imag()); }
+  __host__
+  operator std::complex<T>() const { return std::complex<T>(real(), imag()); }
 
 private:
-  T m_data[2];
+  typename detail::complex_storage<T, sizeof(T) * 2>::type data;
 };
 
 
@@ -206,129 +466,211 @@ struct complex
  *
  *  \param z The \p complex from which to calculate the absolute value.
  */
-template<typename T> __host__ __device__ inline T abs(const complex<T>& z);
+template<typename T>
+__host__ __device__
+T abs(const complex<T>& z);
 
 /*! Returns the phase angle (also known as argument) in radians of a \p complex.
  *
  *  \param z The \p complex from which to calculate the phase angle.
  */
-template<typename T> __host__ __device__ inline T arg(const complex<T>& z);
+template <typename T>
+__host__ __device__
+T arg(const complex<T>& z);
 
 /*! Returns the square of the magnitude of a \p complex.
  *
  *  \param z The \p complex from which to calculate the norm.
  */
-template<typename T> __host__ __device__ inline T norm(const complex<T>& z);
+template <typename T>
+__host__ __device__
+T norm(const complex<T>& z);
 
 /*! Returns the complex conjugate of a \p complex.
  *
  *  \param z The \p complex from which to calculate the complex conjugate.
  */
-template<typename T> __host__ __device__ inline complex<T> conj(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> conj(const complex<T>& z);
 
 /*! Returns a \p complex with the specified magnitude and phase.
  *
  *  \param m The magnitude of the returned \p complex.
  *  \param theta The phase of the returned \p complex in radians.
  */
-template<typename T> __host__ __device__ inline complex<T> polar(const T& m, const T& theta = 0);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+polar(const T0& m, const T1& theta = T1());
 
 /*! Returns the projection of a \p complex on the Riemann sphere.
- *  For all finite \p complex it returns the argument. For \p complexs 
- *  with a non finite part returns (INFINITY,+/-0) where the sign of 
+ *  For all finite \p complex it returns the argument. For \p complexs
+ *  with a non finite part returns (INFINITY,+/-0) where the sign of
  *  the zero matches the sign of the imaginary part of the argument.
  *
  *  \param z The \p complex argument.
  */
-template<typename T> __host__ __device__ inline complex<T> proj(const T& z);
+template <typename T>
+__host__ __device__
+complex<T> proj(const T& z);
 
 
 
 /* --- Binary Arithmetic operators --- */
 
-/*! Multiplies two \p complex numbers.
+/*! Adds two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The first \p complex.
- *  \param rhs The second \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline complex<T> operator*(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const complex<T1>& y);
 
-/*! Multiplies a \p complex number by a scalar.
+/*! Adds a scalar to a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The \p complex.
- *  \param rhs The scalar.
+ *  \param x The \p complex.
+ *  \param y The scalar.
  */
-template <typename T> __host__ __device__ inline complex<T> operator*(const complex<T>& lhs, const T & rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const T1& y);
 
-/*! Multiplies a scalr by a \p complex number.
+/*! Adds a \p complex number to a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The scalar.
- *  \param rhs The \p complex.
+ *  \param x The scalar.
+ *  \param y The \p complex.
  */
-template <typename T> __host__ __device__ inline complex<T> operator*(const T& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const T0& x, const complex<T1>& y);
 
-/*! Divides two \p complex numbers.
+/*! Subtracts two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The numerator (dividend).
- *  \param rhs The denomimator (divisor).
+ *  \param x The first \p complex (minuend).
+ *  \param y The second \p complex (subtrahend).
  */
-template <typename T> __host__ __device__ inline complex<T> operator/(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const complex<T1>& y);
 
-/*! Divides a \p complex number by a scalar.
+/*! Subtracts a scalar from a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The complex numerator (dividend).
- *  \param rhs The scalar denomimator (divisor).
+ *  \param x The \p complex (minuend).
+ *  \param y The scalar (subtrahend).
  */
-template <typename T> __host__ __device__ inline complex<T> operator/(const complex<T>& lhs, const T & rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const T1& y);
 
-/*! Divides a scalar by a \p complex number.
+/*! Subtracts a \p complex number from a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The scalar numerator (dividend).
- *  \param rhs The complex denomimator (divisor).
+ *  \param x The scalar (minuend).
+ *  \param y The \p complex (subtrahend).
  */
-template <typename T> __host__ __device__ inline complex<T> operator/(const T& lhs, const complex<T> & rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const T0& x, const complex<T1>& y);
 
-/*! Adds two \p complex numbers.
+/*! Multiplies two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The first \p complex.
- *  \param rhs The second \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline complex<T> operator+(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const complex<T1>& y);
 
-/*! Adds a scalar to a \p complex number.
+/*! Multiplies a \p complex number by a scalar.
  *
- *  \param lhs The \p complex.
- *  \param rhs The scalar.
+ *  \param x The \p complex.
+ *  \param y The scalar.
  */
-template <typename T> __host__ __device__ inline complex<T> operator+(const complex<T>& lhs, const T & rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const T1& y);
 
-/*! Adds a \p complex number to a scalar.
+/*! Multiplies a scalar by a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The scalar.
- *  \param rhs The \p complex.
+ *  \param x The scalar.
+ *  \param y The \p complex.
  */
-template <typename T> __host__ __device__ inline complex<T> operator+(const T& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const T0& x, const complex<T1>& y);
 
-/*! Subtracts two \p complex numbers.
+/*! Divides two \p complex numbers.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The first \p complex (minuend).
- *  \param rhs The second \p complex (subtrahend).
+ *  \param x The numerator (dividend).
+ *  \param y The denomimator (divisor).
  */
-template <typename T> __host__ __device__ inline complex<T> operator-(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const complex<T1>& y);
 
-/*! Subtracts a scalar from a \p complex number.
+/*! Divides a \p complex number by a scalar.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The \p complex (minuend).
- *  \param rhs The scalar (subtrahend).
+ *  \param x The complex numerator (dividend).
+ *  \param y The scalar denomimator (divisor).
  */
-template <typename T> __host__ __device__ inline complex<T> operator-(const complex<T>& lhs, const T & rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const T1& y);
 
-/*! Subtracts a \p complex number from a scalar.
+/*! Divides a scalar by a \p complex number.
+ *
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
- *  \param lhs The scalar (minuend).
- *  \param rhs The \p complex (subtrahend).
+ *  \param x The scalar numerator (dividend).
+ *  \param y The complex denomimator (divisor).
  */
-template <typename T> __host__ __device__ inline complex<T> operator-(const T& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const T0& x, const complex<T1>& y);
 
 
 
@@ -336,15 +678,22 @@ template <typename T> __host__ __device__ inline complex<T> operator-(const T& l
 
 /*! Unary plus, returns its \p complex argument.
  *
- *  \param rhs The \p complex argument.
+ *  \param y The \p complex argument.
  */
-template <typename T> __host__ __device__ inline complex<T> operator+(const complex<T>& rhs);
+template <typename T>
+__host__ __device__
+complex<T>
+operator+(const complex<T>& y);
 
-/*! Unary minus, returns the additive inverse (negation) of its \p complex argument.
+/*! Unary minus, returns the additive inverse (negation) of its \p complex
+ * argument.
  *
- *  \param rhs The \p complex argument.
+ *  \param y The \p complex argument.
  */
-template <typename T> __host__ __device__ inline complex<T> operator-(const complex<T>& rhs);
+template <typename T>
+__host__ __device__
+complex<T>
+operator-(const complex<T>& y);
 
 
 
@@ -354,78 +703,76 @@ template <typename T> __host__ __device__ inline complex<T> operator-(const comp
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> exp(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> exp(const complex<T>& z);
 
 /*! Returns the complex natural logarithm of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> log(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> log(const complex<T>& z);
 
 /*! Returns the complex base 10 logarithm of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ inline complex<T> log10(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> log10(const complex<T>& z);
 
 
 
 /* --- Power Functions --- */
 
 /*! Returns a \p complex number raised to another.
+ *
+ *  The value types of the two \p complex types should be compatible and the
+ *  type of the returned \p complex is the promoted type of the two arguments.
  *
  *  \param x The base.
  *  \param y The exponent.
  */
-template <typename T> __host__ __device__ complex<T> pow(const complex<T>& x, const complex<T>& y);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const complex<T1>& y);
 
 /*! Returns a \p complex number raised to a scalar.
  *
- *  \param x The \p complex base.
- *  \param y The scalar exponent.
- */
-template <typename T> __host__ __device__ complex<T> pow(const complex<T>& x, const T& y);
-
-/*! Returns a scalar raised to a \p complex number.
- *
- *  \param x The scalar base.
- *  \param y The \p complex exponent.
- */
-template <typename T> __host__ __device__ complex<T> pow(const T& x, const complex<T>& y);
-
-#if !defined _MSC_VER
-/*! Returns a \p complex number raised to another. The types of the two \p complex should be compatible
- * and the type of the returned \p complex is the promoted type of the two arguments.
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
  *  \param x The base.
  *  \param y The exponent.
  */
-template <typename T, typename U> __host__ __device__ complex<typename detail::promoted_numerical_type<T,U>::type > pow(const complex<T>& x, const complex<U>& y);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const T1& y);
 
-/*! Returns a \p complex number raised to a scalar. The type of the \p complex should be compatible with the scalar
- * and the type of the returned \p complex is the promoted type of the two arguments.
+/*! Returns a scalar raised to a \p complex number.
  *
- *  \param x The base.
- *  \param y The exponent.
- */
-template <typename T, typename U> __host__ __device__ complex<typename detail::promoted_numerical_type<T,U>::type > pow(const complex<T>& x, const U& y);
-
-/*! Returns a scalar raised to a \p complex number. The type of the \p complex should be compatible with the scalar
- * and the type of the returned \p complex is the promoted type of the two arguments.
+ *  The value type of the \p complex should be compatible with the scalar and
+ *  the type of the returned \p complex is the promoted type of the two arguments.
  *
  *  \param x The base.
  *  \param y The exponent.
  */
-template <typename T, typename U> __host__ __device__ complex<typename detail::promoted_numerical_type<T,U>::type > pow(const T& x,const complex<U>& y);
-
-#endif // !defined _MSC_VER
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const T0& x, const complex<T1>& y);
 
 /*! Returns the complex square root of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> sqrt(const complex<T>&z);
-
+template <typename T>
+__host__ __device__
+complex<T> sqrt(const complex<T>& z);
 
 
 /* --- Trigonometric Functions --- */
@@ -434,19 +781,25 @@ template <typename T> __host__ __device__ complex<T> sqrt(const complex<T>&z);
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> cos(const complex<T>&z);
+template <typename T>
+__host__ __device__
+complex<T> cos(const complex<T>& z);
 
 /*! Returns the complex sine of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> sin(const complex<T>&z);
+template <typename T>
+__host__ __device__
+complex<T> sin(const complex<T>& z);
 
 /*! Returns the complex tangent of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> tan(const complex<T>&z);
+template <typename T>
+__host__ __device__
+complex<T> tan(const complex<T>& z);
 
 
 
@@ -456,19 +809,25 @@ template <typename T> __host__ __device__ complex<T> tan(const complex<T>&z);
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> cosh(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> cosh(const complex<T>& z);
 
 /*! Returns the complex hyperbolic sine of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> sinh(const complex<T>&z);
+template <typename T>
+__host__ __device__
+complex<T> sinh(const complex<T>& z);
 
 /*! Returns the complex hyperbolic tangent of a \p complex number.
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> tanh(const complex<T>&z);
+template <typename T>
+__host__ __device__
+complex<T> tanh(const complex<T>& z);
 
 
 
@@ -476,30 +835,36 @@ template <typename T> __host__ __device__ complex<T> tanh(const complex<T>&z);
 
 /*! Returns the complex arc cosine of a \p complex number.
  *
- *  The range of the real part of the result is [0, Pi] and 
+ *  The range of the real part of the result is [0, Pi] and
  *  the range of the imaginary part is [-inf, +inf]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> acos(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> acos(const complex<T>& z);
 
 /*! Returns the complex arc sine of a \p complex number.
  *
- *  The range of the real part of the result is [-Pi/2, Pi/2] and 
+ *  The range of the real part of the result is [-Pi/2, Pi/2] and
  *  the range of the imaginary part is [-inf, +inf]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> asin(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> asin(const complex<T>& z);
 
 /*! Returns the complex arc tangent of a \p complex number.
  *
- *  The range of the real part of the result is [-Pi/2, Pi/2] and 
+ *  The range of the real part of the result is [-Pi/2, Pi/2] and
  *  the range of the imaginary part is [-inf, +inf]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> atan(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> atan(const complex<T>& z);
 
 
 
@@ -507,58 +872,66 @@ template <typename T> __host__ __device__ complex<T> atan(const complex<T>& z);
 
 /*! Returns the complex inverse hyperbolic cosine of a \p complex number.
  *
- *  The range of the real part of the result is [0, +inf] and 
+ *  The range of the real part of the result is [0, +inf] and
  *  the range of the imaginary part is [-Pi, Pi]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> acosh(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> acosh(const complex<T>& z);
 
 /*! Returns the complex inverse hyperbolic sine of a \p complex number.
  *
- *  The range of the real part of the result is [-inf, +inf] and 
+ *  The range of the real part of the result is [-inf, +inf] and
  *  the range of the imaginary part is [-Pi/2, Pi/2]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> asinh(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> asinh(const complex<T>& z);
 
 /*! Returns the complex inverse hyperbolic tangent of a \p complex number.
  *
- *  The range of the real part of the result is [-inf, +inf] and 
+ *  The range of the real part of the result is [-inf, +inf] and
  *  the range of the imaginary part is [-Pi/2, Pi/2]
  *
  *  \param z The \p complex argument.
  */
-template <typename T> __host__ __device__ complex<T> atanh(const complex<T>& z);
+template <typename T>
+__host__ __device__
+complex<T> atanh(const complex<T>& z);
 
 
 
 /* --- Stream Operators --- */
 
-/*! Writes to an output stream a \p complex number in the form (real,imaginary).
+/*! Writes to an output stream a \p complex number in the form (real, imaginary).
  *
  *  \param os The output stream.
  *  \param z The \p complex number to output.
  */
-template<typename ValueType, typename charT, typename traits>
-std::basic_ostream<charT, traits>&
-operator<<(std::basic_ostream<charT, traits>& os, const complex<ValueType>& z);
+template <typename T, typename CharT, typename Traits>
+std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, const complex<T>& z);
 
 /*! Reads a \p complex number from an input stream.
+ *
  *  The recognized formats are:
  * - real
  * - (real)
  * - (real, imaginary)
  *
- * The values read must be convertible to the \p complex's \c value_type 
+ * The values read must be convertible to the \p complex's \c value_type
  *
  *  \param is The input stream.
  *  \param z The \p complex number to set.
  */
-template<typename ValueType, typename charT, typename traits>
-std::basic_istream<charT, traits>&
-operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z);
+template <typename T, typename CharT, typename Traits>
+__host__
+std::basic_istream<CharT, Traits>&
+operator>>(std::basic_istream<CharT, Traits>& is, complex<T>& z);
 
 
 
@@ -566,50 +939,106 @@ operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z);
 
 /*! Returns true if two \p complex numbers are equal and false otherwise.
  *
- *  \param lhs The first \p complex.
- *  \param rhs The second \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator==(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const complex<T1>& y);
 
-/*! Returns true if the imaginary part of the  \p complex number is zero and the real part is equal to the scalar. Returns false otherwise.
+/*! Returns true if two \p complex numbers are equal and false otherwise.
+ *
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
+ */
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const complex<T0>& x, const std::complex<T1>& y);
+
+/*! Returns true if two \p complex numbers are equal and false otherwise.
  *
- *  \param lhs The scalar.
- *  \param rhs The \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator==(const T & lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const std::complex<T0>& x, const complex<T1>& y);
 
-/*! Returns true if the imaginary part of the  \p complex number is zero and the real part is equal to the scalar. Returns false otherwise.
+/*! Returns true if the imaginary part of the \p complex number is zero and
+ *  the real part is equal to the scalar. Returns false otherwise.
  *
- *  \param lhs The \p complex.
- *  \param rhs The scalar.
+ *  \param x The scalar.
+ *  \param y The \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator==(const complex<T> & lhs, const T& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const T0& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is zero and
+ *  the real part is equal to the scalar. Returns false otherwise.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const T1& y);
 
 /*! Returns true if two \p complex numbers are different and false otherwise.
  *
- *  \param lhs The first \p complex.
- *  \param rhs The second \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator!=(const complex<T>& lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const complex<T1>& y);
 
-/*! Returns true if the imaginary part of the  \p complex number is not zero or the real part is different from the scalar. Returns false otherwise.
+/*! Returns true if two \p complex numbers are different and false otherwise.
  *
- *  \param lhs The scalar.
- *  \param rhs The \p complex.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator!=(const T & lhs, const complex<T>& rhs);
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const complex<T0>& x, const std::complex<T1>& y);
 
-/*! Returns true if the imaginary part of the \p complex number is not zero or the real part is different from the scalar. Returns false otherwise.
+/*! Returns true if two \p complex numbers are different and false otherwise.
  *
- *  \param lhs The \p complex.
- *  \param rhs The scalar.
+ *  \param x The first \p complex.
+ *  \param y The second \p complex.
  */
-template <typename T> __host__ __device__ inline bool operator!=(const complex<T> & lhs, const T& rhs);
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const std::complex<T0>& x, const complex<T1>& y);
 
-} // end namespace thrust
+/*! Returns true if the imaginary part of the \p complex number is not zero or
+ *  the real part is different from the scalar. Returns false otherwise.
+ *
+ *  \param x The scalar.
+ *  \param y The \p complex.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const T0& x, const complex<T1>& y);
+
+/*! Returns true if the imaginary part of the \p complex number is not zero or
+ *  the real part is different from the scalar. Returns false otherwise.
+ *
+ *  \param x The \p complex.
+ *  \param y The scalar.
+ */
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const T1& y);
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/complex/complex.inl>
 
+#undef THRUST_STD_COMPLEX_REAL
+#undef THRUST_STD_COMPLEX_IMAG
+#undef THRUST_STD_COMPLEX_DEVICE
+
 /*! \} // complex_numbers
  */
 
diff --git a/thrust/copy.h b/thrust/copy.h
index eb847f41c..99d488174 100644
--- a/thrust/copy.h
+++ b/thrust/copy.h
@@ -15,7 +15,7 @@
  */
 
 
-/*! \file copy.h
+/*! \file thrust/copy.h
  *  \brief Copies elements from one range to another
  */
 
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -54,11 +53,11 @@ namespace thrust
  *  \param last The end of the sequence to copy.
  *  \param result The destination sequence.
  *  \return The end of the destination sequence.
- *  \see http://www.sgi.com/tech/stl/copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
  *
@@ -107,9 +106,9 @@ __host__ __device__
  *  \return The end of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
  *  \tparam Size is an integral type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
  *
@@ -130,7 +129,7 @@ __host__ __device__
  *  // vec1 is now a copy of vec0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy_n
  *  \see thrust::copy
  */
 template<typename DerivedPolicy, typename InputIterator, typename Size, typename OutputIterator>
@@ -157,10 +156,10 @@ __host__ __device__
  *  \param last The end of the sequence to copy.
  *  \param result The destination sequence.
  *  \return The end of the destination sequence.
- *  \see http://www.sgi.com/tech/stl/copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, last)</tt> otherwise.
  *
@@ -202,9 +201,9 @@ template<typename InputIterator, typename OutputIterator>
  *  \param result The beginning destination range.
  *  \return The end of the destination range.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
  *  \tparam Size is an integral type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre \p result may be equal to \p first, but \p result shall not be in the range <tt>[first, first + n)</tt> otherwise.
  *
@@ -224,7 +223,7 @@ template<typename InputIterator, typename OutputIterator>
  *  // vec1 is now a copy of vec0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/copy_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/copy_n
  *  \see thrust::copy
  */
 template<typename InputIterator, typename Size, typename OutputIterator>
@@ -261,10 +260,10 @@ template<typename InputIterator, typename Size, typename OutputIterator>
  *          evaluated to \c true in the range <tt>[first, last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -323,10 +322,10 @@ __host__ __device__
  *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
  *          evaluated to \c true in the range <tt>[first, last)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -388,11 +387,11 @@ template<typename InputIterator,
  *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -455,11 +454,11 @@ __host__ __device__
  *  \return <tt>result + n</tt>, where \c n is equal to the number of times \p pred
  *          evaluated to \c true in the range <tt>[stencil, stencil + (last-first))</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/OutputIterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
  *  \pre The ranges <tt>[stencil, stencil + (last - first))</tt> and <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -505,8 +504,8 @@ template<typename InputIterator1,
 
 /*! \} // end stream_compaction
  */
-	
-} // end namespace thrust
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/copy.h>
 #include <thrust/detail/copy_if.h>
diff --git a/thrust/count.h b/thrust/count.h
index 9225bc6a7..abf8b2d6c 100644
--- a/thrust/count.h
+++ b/thrust/count.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -56,8 +54,8 @@ namespace thrust
  *  \return The number of elements equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
  *
  *  The following code snippet demonstrates how to use \p count to 
  *  count the number of instances in a range of a value of interest using the \p thrust::device execution policy:
@@ -78,7 +76,7 @@ namespace thrust
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
 __host__ __device__
@@ -96,8 +94,8 @@ __host__ __device__
  *  \param value The value to be counted.
  *  \return The number of elements equal to \p value.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam EqualityComparable must be a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be a model of must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam EqualityComparable must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a> and can be compared for equality with \c InputIterator's \c value_type
  *
  *  The following code snippet demonstrates how to use \p count to 
  *  count the number of instances in a range of a value of interest.
@@ -116,7 +114,7 @@ __host__ __device__
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template <typename InputIterator, typename EqualityComparable>
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -136,8 +134,8 @@ template <typename InputIterator, typename EqualityComparable>
  *  \return The number of elements where \p pred is \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p count to
  *  count the number of odd numbers in a range using the \p thrust::device execution policy:
@@ -169,7 +167,7 @@ template <typename InputIterator, typename EqualityComparable>
  *  // result == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
 __host__ __device__
@@ -186,8 +184,8 @@ __host__ __device__
  *  \param pred The predicate.
  *  \return The number of elements where \p pred is \c true.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p count to
  *  count the number of odd numbers in a range.
@@ -217,7 +215,7 @@ __host__ __device__
  *  // result == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/count.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/count
  */
 template <typename InputIterator, typename Predicate>
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -228,8 +226,6 @@ template <typename InputIterator, typename Predicate>
  *  \} // end reductions
  */
 
+THRUST_NAMESPACE_END
 
-} // end thrust
-
-#include <thrust/detail/count.inl>
-
+#include <thrust/detail/count.h>
diff --git a/thrust/detail/adjacent_difference.inl b/thrust/detail/adjacent_difference.inl
index 4593f8d06..844687cff 100644
--- a/thrust/detail/adjacent_difference.inl
+++ b/thrust/detail/adjacent_difference.inl
@@ -14,24 +14,20 @@
  *  limitations under the License.
  */
 
-
-/*! \file adjacent_difference.inl
- *  \brief Inline file for adjacent_difference.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 #include <thrust/system/detail/adl/adjacent_difference.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator>
+__host__ __device__
 OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last, 
+                                   InputIterator first, InputIterator last,
                                    OutputIterator result)
 {
   using thrust::system::detail::generic::adjacent_difference;
@@ -40,10 +36,11 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
 } // end adjacent_difference()
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__
 OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last, 
+                                   InputIterator first, InputIterator last,
                                    OutputIterator result,
                                    BinaryFunction binary_op)
 {
@@ -54,7 +51,7 @@ OutputIterator adjacent_difference(const thrust::detail::execution_policy_base<D
 
 
 template <typename InputIterator, typename OutputIterator>
-OutputIterator adjacent_difference(InputIterator first, InputIterator last, 
+OutputIterator adjacent_difference(InputIterator first, InputIterator last,
                                    OutputIterator result)
 {
   using thrust::system::detail::generic::select_system;
@@ -86,5 +83,4 @@ OutputIterator adjacent_difference(InputIterator first, InputIterator last,
 } // end adjacent_difference()
 
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/advance.inl b/thrust/detail/advance.inl
index b8f10723b..7b5f261bd 100644
--- a/thrust/detail/advance.inl
+++ b/thrust/detail/advance.inl
@@ -14,26 +14,61 @@
  *  limitations under the License.
  */
 
-
-/*! \file advance.inl
- *  \brief Inline file for advance.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/advance.h>
 #include <thrust/system/detail/generic/advance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/has_nested_type.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_difference_type, difference_type)
 
 template <typename InputIterator, typename Distance>
 __host__ __device__
 void advance(InputIterator& i, Distance n)
 {
   thrust::system::detail::generic::advance(i, n);
-} // end advance()
+}
 
+template <typename InputIterator>
+__host__ __device__
+InputIterator next(
+  InputIterator i
+, typename iterator_traits<InputIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, n);
+  return i;
+}
 
-} // end namespace thrust
+template <typename BidirectionalIterator>
+__host__ __device__
+BidirectionalIterator prev(
+  BidirectionalIterator i
+, typename iterator_traits<BidirectionalIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, -n);
+  return i;
+}
+
+template <typename BidirectionalIterator>
+__host__ __device__
+typename detail::disable_if<
+  has_difference_type<iterator_traits<BidirectionalIterator> >::value
+, BidirectionalIterator
+>::type prev(
+  BidirectionalIterator i
+, typename detail::pointer_traits<BidirectionalIterator>::difference_type n = 1
+)
+{
+  thrust::system::detail::generic::advance(i, -n);
+  return i;
+}
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/algorithm_wrapper.h b/thrust/detail/algorithm_wrapper.h
new file mode 100644
index 000000000..c09b9a0a0
--- /dev/null
+++ b/thrust/detail/algorithm_wrapper.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <algorithm>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/alignment.h b/thrust/detail/alignment.h
new file mode 100644
index 000000000..08f73501e
--- /dev/null
+++ b/thrust/detail/alignment.h
@@ -0,0 +1,230 @@
+/*
+ *  Copyright 2017 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file alignment.h
+ *  \brief Type-alignment utilities.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h> // For `integral_constant`.
+
+#include <cstddef> // For `std::size_t` and `std::max_align_t`.
+
+#if THRUST_CPP_DIALECT >= 2011
+    #include <type_traits> // For `std::alignment_of` and `std::aligned_storage`.
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+/// \p THRUST_ALIGNOF is a macro that takes a single type-id as a parameter,
+/// and returns the alignment requirement of the type in bytes.
+/// 
+/// It is an approximation of C++11's `alignof` operator.
+///
+/// Note: MSVC does not allow the builtin used to implement this to be placed
+/// inside of a `__declspec(align(#))` attribute. As a workaround, you can
+/// assign the result of \p THRUST_ALIGNOF to a variable and pass the variable
+/// as the argument to `__declspec(align(#))`.
+#if THRUST_CPP_DIALECT >= 2011
+    #define THRUST_ALIGNOF(x) alignof(x) 
+#else
+    #define THRUST_ALIGNOF(x) __alignof(x)
+#endif
+
+/// \p alignment_of provides the member constant `value` which is equal to the
+/// alignment requirement of the type `T`, as if obtained by a C++11 `alignof`
+/// expression.
+/// 
+/// It is an implementation of C++11's \p std::alignment_of.
+#if THRUST_CPP_DIALECT >= 2011
+    template <typename T>
+    using alignment_of = std::alignment_of<T>;
+#else
+    template <typename T>
+    struct alignment_of;
+
+    template <typename T, std::size_t size_diff>
+    struct alignment_of_helper
+    {
+        static const std::size_t value =
+            integral_constant<std::size_t, size_diff>::value;
+    };
+
+    template <typename T>
+    struct alignment_of_helper<T, 0>
+    {
+        static const std::size_t value = alignment_of<T>::value;
+    };
+
+    template <typename T>
+    struct alignment_of
+    {
+      private:
+        struct impl
+        {
+            T    x;
+            char c;
+        };
+
+      public:
+        static const std::size_t value =
+            alignment_of_helper<impl, sizeof(impl) - sizeof(T)>::value;
+    };
+#endif
+
+/// \p aligned_type provides the nested type `type`, which is a trivial
+/// type whose alignment requirement is a divisor of `Align`.
+///
+/// The behavior is undefined if `Align` is not a power of 2.
+template <std::size_t Align>
+struct aligned_type;
+
+#if THRUST_CPP_DIALECT >= 2011                                                     \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
+  && (THRUST_GCC_VERSION >= 40800)
+    // C++11 implementation, excluding GCC 4.7, which doesn't have `alignas`.
+    template <std::size_t Align>
+    struct aligned_type
+    {
+        struct alignas(Align) type {};
+    };
+#elif  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)                    \
+    || (   (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                 \
+        && (THRUST_GCC_VERSION < 40600))
+    // C++03 implementation for MSVC and GCC <= 4.5.
+    // 
+    // We have to implement `aligned_type` with specializations for MSVC
+    // and GCC 4.2.x and older because they require literals as arguments to 
+    // their alignment attribute.
+
+    #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC)
+        // MSVC implementation.
+        #define THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(X)                  \
+            template <>                                                       \
+            struct aligned_type<X>                                            \
+            {                                                                 \
+                __declspec(align(X)) struct type {};                          \
+            };                                                                \
+            /**/
+    #else
+        // GCC <= 4.2 implementation.
+        #define THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(X)                  \
+            template <>                                                       \
+            struct aligned_type<X>                                            \
+            {                                                                 \
+                struct type {} __attribute__((aligned(X)));                   \
+            };                                                                \
+            /**/
+    #endif
+    
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(1);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(2);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(4);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(8);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(16);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(32);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(64);
+    THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION(128);
+
+    #undef THRUST_DEFINE_ALIGNED_TYPE_SPECIALIZATION
+#else
+    // C++03 implementation for GCC > 4.5, Clang, PGI, ICPC, and xlC.
+    template <std::size_t Align>
+    struct aligned_type
+    {
+        struct type {} __attribute__((aligned(Align)));
+    };
+#endif
+
+/// \p aligned_storage provides the nested type `type`, which is a trivial type
+/// suitable for use as uninitialized storage for any object whose size is at
+/// most `Len` bytes and whose alignment requirement is a divisor of `Align`.
+/// 
+/// The behavior is undefined if `Len` is 0 or `Align` is not a power of 2.
+///
+/// It is an implementation of C++11's \p std::aligned_storage.
+#if THRUST_CPP_DIALECT >= 2011
+    template <std::size_t Len, std::size_t Align>
+    using aligned_storage = std::aligned_storage<Len, Align>;
+#else
+    template <std::size_t Len, std::size_t Align>
+    struct aligned_storage
+    {
+        union type
+        {
+            unsigned char data[Len];
+            // We put this into the union in case the alignment requirement of
+            // an array of `unsigned char` of length `Len` is greater than
+            // `Align`.
+
+            typename aligned_type<Align>::type align;
+        };
+    };
+#endif
+
+/// \p max_align_t is a trivial type whose alignment requirement is at least as
+/// strict (as large) as that of every scalar type.
+///
+/// It is an implementation of C++11's \p std::max_align_t.
+#if THRUST_CPP_DIALECT >= 2011                                                     \
+  && (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                        \
+  && (THRUST_GCC_VERSION >= 40900)
+    // GCC 4.7 and 4.8 don't have `std::max_align_t`.
+    using max_align_t = std::max_align_t;
+#else
+    union max_align_t
+    {
+        // These cannot be private because C++03 POD types cannot have private
+        // data members.
+        char c;
+        short s;
+        int i;
+        long l;
+        float f;
+        double d;
+        long long ll;
+        long double ld;
+        void* p;
+    };
+#endif
+
+/// \p aligned_reinterpret_cast `reinterpret_cast`s \p u of type \p U to `void*`
+/// and then `reinterpret_cast`s the result to \p T. The indirection through
+/// `void*` suppresses compiler warnings when the alignment requirement of \p *u
+/// is less than the alignment requirement of \p *t. The caller of
+/// \p aligned_reinterpret_cast is responsible for ensuring that the alignment
+/// requirements are actually satisified.
+template <typename T, typename U>
+__host__ __device__
+T aligned_reinterpret_cast(U u)
+{
+  return reinterpret_cast<T>(reinterpret_cast<void*>(u));
+}
+
+__host__ __device__
+inline std::size_t aligned_storage_size(std::size_t n, std::size_t align)
+{
+  return ((n + align - 1) / align) * align;
+}
+
+} // end namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/allocator/allocator_traits.h b/thrust/detail/allocator/allocator_traits.h
index bc5de156c..3a5af3661 100644
--- a/thrust/detail/allocator/allocator_traits.h
+++ b/thrust/detail/allocator/allocator_traits.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,10 @@
  *  limitations under the License.
  */
 
+// allocator_traits::rebind_alloc and allocator::rebind_traits are from libc++,
+// dual licensed under the MIT and the University of Illinois Open Source
+// Licenses.
+
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -22,8 +26,9 @@
 #include <thrust/detail/type_traits/has_member_function.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+#include <thrust/detail/memory_wrapper.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -48,8 +53,43 @@ __THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_copy_assignment, prop
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_move_assignment, propagate_on_container_move_assignment)
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_propagate_on_container_swap, propagate_on_container_swap)
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_system_type, system_type)
+__THRUST_DEFINE_HAS_NESTED_TYPE(has_is_always_equal, is_always_equal)
 __THRUST_DEFINE_HAS_MEMBER_FUNCTION(has_member_system_impl, system)
 
+template<typename Alloc, typename U>
+  struct has_rebind
+{
+  typedef char yes_type;
+  typedef int  no_type;
+
+  template<typename S>
+  static yes_type test(typename S::template rebind<U>::other*);
+  template<typename S>
+  static no_type  test(...);
+
+  static bool const value = sizeof(test<U>(0)) == sizeof(yes_type);
+
+  typedef thrust::detail::integral_constant<bool, value> type;
+};
+
+// The following fields of std::allocator have been deprecated (since C++17).
+// There's no way to detect it other than explicit specialization.
+#if THRUST_CPP_DIALECT >= 2017
+#define THRUST_SPECIALIZE_DEPRECATED(trait_name)                               \
+template <typename T>                                                          \
+struct trait_name<std::allocator<T>> : false_type {};
+
+THRUST_SPECIALIZE_DEPRECATED(has_is_always_equal)
+THRUST_SPECIALIZE_DEPRECATED(has_pointer)
+THRUST_SPECIALIZE_DEPRECATED(has_const_pointer)
+THRUST_SPECIALIZE_DEPRECATED(has_reference)
+THRUST_SPECIALIZE_DEPRECATED(has_const_reference)
+
+#undef THRUST_SPECIALIZE_DEPRECATED
+
+template<typename T, typename U>
+struct has_rebind<std::allocator<T>, U> : false_type {};
+#endif
 
 template<typename T>
   struct nested_pointer
@@ -117,6 +157,12 @@ template<typename T>
   typedef typename T::propagate_on_container_swap type;
 };
 
+template<typename T>
+  struct nested_is_always_equal
+{
+  typedef typename T::is_always_equal type;
+};
+
 template<typename T>
   struct nested_system_type
 {
@@ -124,15 +170,89 @@ template<typename T>
 };
 
 template<typename Alloc>
-  class has_member_system
+  struct has_member_system
 {
   typedef typename allocator_system<Alloc>::type system_type;
 
-  public:
-    typedef typename has_member_system_impl<Alloc, system_type&(void)>::type type;
-    static const bool value = type::value;
+  typedef typename has_member_system_impl<Alloc, system_type&(void)>::type type;
+  static const bool value = type::value;
+};
+
+template<class Alloc, class U, bool = has_rebind<Alloc, U>::value>
+  struct rebind_alloc
+{
+    typedef typename Alloc::template rebind<U>::other type;
+};
+
+#if THRUST_CPP_DIALECT >= 2011
+template<template<typename, typename...> class Alloc,
+         typename T, typename... Args, typename U>
+  struct rebind_alloc<Alloc<T, Args...>, U, true>
+{
+    typedef typename Alloc<T, Args...>::template rebind<U>::other type;
+};
+
+template<template<typename, typename...> class Alloc,
+         typename T, typename... Args, typename U>
+  struct rebind_alloc<Alloc<T, Args...>, U, false>
+{
+    typedef Alloc<U, Args...> type;
+};
+#else // C++03
+template <template <typename> class Alloc, typename T, typename U>
+  struct rebind_alloc<Alloc<T>, U, true>
+{
+    typedef typename Alloc<T>::template rebind<U>::other type;
+};
+
+template <template <typename> class Alloc, typename T, typename U>
+  struct rebind_alloc<Alloc<T>, U, false>
+{
+    typedef Alloc<U> type;
+};
+
+template<template<typename, typename> class Alloc,
+         typename T, typename A0, typename U>
+  struct rebind_alloc<Alloc<T, A0>, U, true>
+{
+    typedef typename Alloc<T, A0>::template rebind<U>::other type;
+};
+
+template<template<typename, typename> class Alloc,
+         typename T, typename A0, typename U>
+  struct rebind_alloc<Alloc<T, A0>, U, false>
+{
+    typedef Alloc<U, A0> type;
+};
+
+template<template<typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1>, U, true>
+{
+    typedef typename Alloc<T, A0, A1>::template rebind<U>::other type;
 };
 
+template<template<typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1>, U, false>
+{
+    typedef Alloc<U, A0, A1> type;
+};
+
+template<template<typename, typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename A2, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1, A2>, U, true>
+{
+    typedef typename Alloc<T, A0, A1, A2>::template rebind<U>::other type;
+};
+
+template<template<typename, typename, typename, typename> class Alloc,
+         typename T, typename A0, typename A1, typename A2, typename U>
+  struct rebind_alloc<Alloc<T, A0, A1, A2>, U, false>
+{
+    typedef Alloc<U, A0, A1, A2> type;
+};
+#endif
 
 } // end allocator_traits_detail
 
@@ -207,6 +327,12 @@ template<typename Alloc>
     identity_<false_type>
   >::type propagate_on_container_swap;
 
+  typedef typename eval_if<
+    allocator_traits_detail::has_is_always_equal<allocator_type>::value,
+    allocator_traits_detail::nested_is_always_equal<allocator_type>,
+    is_empty<allocator_type>
+  >::type is_always_equal;
+
   typedef typename eval_if<
     allocator_traits_detail::has_system_type<allocator_type>::value,
     allocator_traits_detail::nested_system_type<allocator_type>,
@@ -216,6 +342,35 @@ template<typename Alloc>
   // XXX rebind and rebind_traits are alias templates
   //     and so are omitted while c++11 is unavailable
 
+#if THRUST_CPP_DIALECT >= 2011
+  template <typename U>
+  using rebind_alloc =
+    typename allocator_traits_detail::rebind_alloc<allocator_type, U>::type;
+
+  template <typename U>
+  using rebind_traits = allocator_traits<rebind_alloc<U>>;
+
+  // We define this nested type alias for compatibility with the C++03-style
+  // rebind_* mechanisms.
+  using other = allocator_traits;
+#else
+  template <typename U>
+  struct rebind_alloc
+  {
+    typedef typename
+      allocator_traits_detail::rebind_alloc<allocator_type, U>::type other;
+  };
+  template <typename U>
+  struct rebind_traits
+  {
+    typedef allocator_traits<typename rebind_alloc<U>::other> other;
+  };
+#endif
+
+  // Deprecated std::allocator typedefs that we need:
+  typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
+  typedef typename thrust::detail::pointer_traits<const_pointer>::reference const_reference;
+
   inline __host__ __device__
   static pointer allocate(allocator_type &a, size_type n);
 
@@ -233,6 +388,11 @@ template<typename Alloc>
   template<typename T, typename Arg1>
   inline __host__ __device__ static void construct(allocator_type &a, T *p, const Arg1 &arg1);
 
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename... Args>
+  inline __host__ __device__ static void construct(allocator_type &a, T *p, Args&&... args);
+#endif
+
   template<typename T>
   inline __host__ __device__ static void destroy(allocator_type &a, T *p);
 
@@ -276,7 +436,7 @@ template<typename Alloc>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/allocator_traits.inl>
 
diff --git a/thrust/detail/allocator/allocator_traits.inl b/thrust/detail/allocator/allocator_traits.inl
index 689fc18e7..275330094 100644
--- a/thrust/detail/allocator/allocator_traits.inl
+++ b/thrust/detail/allocator/allocator_traits.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,16 +14,121 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/is_call_possible.h>
 #include <thrust/detail/integer_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/type_deduction.h>
+#endif
+
+#include <thrust/detail/memory_wrapper.h>
 #include <new>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
+
+#if THRUST_CPP_DIALECT >= 2011
+
+// std::allocator's member functions are deprecated in C++17 and removed in
+// C++20, so we can't just use the generic implementation for allocator_traits
+// that calls the allocator's member functions.
+// Instead, specialize allocator_traits for std::allocator and defer to
+// std::allocator_traits<std::allocator> and let the STL do whatever it needs
+// to for the current c++ version. Manually forward the calls to suppress
+// host/device warnings.
+template <typename T>
+struct allocator_traits<std::allocator<T>>
+  : public std::allocator_traits<std::allocator<T>>
+{
+private:
+  using superclass = std::allocator_traits<std::allocator<T>>;
+
+public:
+  using allocator_type = typename superclass::allocator_type;
+  using value_type = typename superclass::value_type;
+  using pointer = typename superclass::pointer;
+  using const_pointer = typename superclass::const_pointer;
+  using void_pointer = typename superclass::void_pointer;
+  using const_void_pointer = typename superclass::const_void_pointer;
+  using difference_type = typename superclass::difference_type;
+  using size_type = typename superclass::size_type;
+  using propagate_on_container_swap = typename superclass::propagate_on_container_swap;
+  using propagate_on_container_copy_assignment =
+    typename superclass::propagate_on_container_copy_assignment;
+  using propagate_on_container_move_assignment =
+    typename superclass::propagate_on_container_move_assignment;
+
+  // std::allocator_traits added this in C++17, but thrust::allocator_traits defines
+  // it unconditionally.
+  using is_always_equal = typename eval_if<
+      allocator_traits_detail::has_is_always_equal<allocator_type>::value,
+      allocator_traits_detail::nested_is_always_equal<allocator_type>,
+      is_empty<allocator_type>
+    >::type;
+
+  // std::allocator_traits doesn't provide these, but
+  // thrust::detail::allocator_traits does. These used to be part of the
+  // std::allocator API but were deprecated in C++17.
+  using reference = typename thrust::detail::pointer_traits<pointer>::reference;
+  using const_reference = typename thrust::detail::pointer_traits<const_pointer>::reference;
+
+  template <typename U>
+  using rebind_alloc = std::allocator<U>;
+  template <typename U>
+  using rebind_traits = allocator_traits<std::allocator<U>>;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n)
+  {
+    return superclass::allocate(a, n);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static pointer allocate(allocator_type &a, size_type n, const_void_pointer hint)
+  {
+    return superclass::allocate(a, n, hint);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static void deallocate(allocator_type &a, pointer p, size_type n)
+  {
+    superclass::deallocate(a, p, n);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U, typename ...Args>
+  __host__ __device__
+  static void construct(allocator_type &a, U *p, Args&&... args)
+  {
+    superclass::construct(a, p, THRUST_FWD(args)...);
+  }
+
+  __thrust_exec_check_disable__
+  template <typename U>
+  __host__ __device__
+  static void destroy(allocator_type &a, U *p)
+  {
+    superclass::destroy(a, p);
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  static size_type max_size(const allocator_type &a)
+  {
+    return superclass::max_size(a);
+  }
+};
+
+#endif //  C++11
+
 namespace allocator_traits_detail
 {
 
@@ -82,12 +187,13 @@ template<typename Alloc, typename T>
   a.construct(p);
 }
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T>
   inline __host__ __device__
     typename disable_if<
       has_member_construct1<Alloc,T>::value
     >::type
-      construct(Alloc &a, T *p)
+      construct(Alloc &, T *p)
 {
   ::new(static_cast<void*>(p)) T();
 }
@@ -100,6 +206,7 @@ template<typename Alloc, typename T, typename Arg1>
     : has_member_construct2_impl<Alloc, void(T*,const Arg1 &)>
 {};
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T, typename Arg1>
   inline __host__ __device__
     typename enable_if<
@@ -110,6 +217,7 @@ template<typename Alloc, typename T, typename Arg1>
   a.construct(p,arg1);
 }
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T, typename Arg1>
   inline __host__ __device__
     typename disable_if<
@@ -120,6 +228,38 @@ template<typename Alloc, typename T, typename Arg1>
   ::new(static_cast<void*>(p)) T(arg1);
 }
 
+#if THRUST_CPP_DIALECT >= 2011
+
+__THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_constructN_impl, construct)
+
+template<typename Alloc, typename T, typename... Args>
+  struct has_member_constructN
+    : has_member_constructN_impl<Alloc, void(T*, Args...)>
+{};
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename... Args>
+  inline __host__ __device__
+    typename enable_if<
+      has_member_constructN<Alloc, T, Args...>::value
+    >::type
+      construct(Alloc &a, T* p, Args&&... args)
+{
+  a.construct(p, THRUST_FWD(args)...);
+}
+
+__thrust_exec_check_disable__
+template<typename Alloc, typename T, typename... Args>
+  inline __host__ __device__
+    typename disable_if<
+      has_member_constructN<Alloc, T, Args...>::value
+    >::type
+      construct(Alloc &, T* p, Args&&... args)
+{
+  ::new(static_cast<void*>(p)) T(THRUST_FWD(args)...);
+}
+
+#endif
 
 __THRUST_DEFINE_IS_CALL_POSSIBLE(has_member_destroy_impl, destroy)
 
@@ -128,6 +268,7 @@ template<typename Alloc, typename T>
     : has_member_destroy_impl<Alloc, void(T*)>
 {};
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T>
   inline __host__ __device__
     typename enable_if<
@@ -138,6 +279,7 @@ template<typename Alloc, typename T>
   a.destroy(p);
 }
 
+__thrust_exec_check_disable__
 template<typename Alloc, typename T>
   inline __host__ __device__
     typename disable_if<
@@ -178,7 +320,7 @@ __host__ __device__
     has_member_max_size<Alloc>::value,
     typename allocator_traits<Alloc>::size_type
   >::type
-    max_size(const Alloc &a)
+    max_size(const Alloc &)
 {
   typedef typename allocator_traits<Alloc>::size_type size_type;
   return thrust::detail::integer_traits<size_type>::const_max;
@@ -202,11 +344,10 @@ __host__ __device__
     has_member_system<Alloc>::value,
     typename allocator_system<Alloc>::type
   >::type
-    system(Alloc &a)
+    system(Alloc &)
 {
-  // return a copy of a default-constructed system
-  typename allocator_system<Alloc>::type result;
-  return result;
+  // return a copy of a value-initialized system
+  return typename allocator_system<Alloc>::type();
 }
 
 
@@ -222,7 +363,7 @@ __host__ __device__
   struct workaround_warnings
   {
     __thrust_exec_check_disable__
-    static __host__ __device__ 
+    static __host__ __device__
     typename allocator_traits<Alloc>::pointer
       allocate(Alloc &a, typename allocator_traits<Alloc>::size_type n)
     {
@@ -278,6 +419,19 @@ template<typename Alloc>
   return allocator_traits_detail::construct(a,p,arg1);
 }
 
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename Alloc>
+  template<typename T, typename... Args>
+  __host__ __device__
+    void allocator_traits<Alloc>
+      ::construct(allocator_type &a, T *p, Args&&... args)
+{
+  return allocator_traits_detail::construct(a, p, THRUST_FWD(args)...);
+}
+
+#endif
+
 template<typename Alloc>
   template<typename T>
   __host__ __device__
@@ -307,5 +461,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/copy_construct_range.h b/thrust/detail/allocator/copy_construct_range.h
index 491c8ef41..b3c2de324 100644
--- a/thrust/detail/allocator/copy_construct_range.h
+++ b/thrust/detail/allocator/copy_construct_range.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -41,7 +40,7 @@ __host__ __device__
                                  Pointer result);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/copy_construct_range.inl>
 
diff --git a/thrust/detail/allocator/copy_construct_range.inl b/thrust/detail/allocator/copy_construct_range.inl
index d2eb281c5..a71cca1f7 100644
--- a/thrust/detail/allocator/copy_construct_range.inl
+++ b/thrust/detail/allocator/copy_construct_range.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
@@ -24,10 +26,9 @@
 #include <thrust/distance.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/for_each.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -92,7 +93,7 @@ __host__ __device__
     Pointer
   >::type
     uninitialized_copy_with_allocator(Allocator &a,
-                                      const thrust::execution_policy<FromSystem> &from_system,
+                                      const thrust::execution_policy<FromSystem> &,
                                       const thrust::execution_policy<ToSystem> &to_system,
                                       InputIterator first,
                                       InputIterator last,
@@ -134,7 +135,7 @@ __host__ __device__
     Pointer
   >::type
     uninitialized_copy_with_allocator_n(Allocator &a,
-                                        const thrust::execution_policy<FromSystem> &from_system,
+                                        const thrust::execution_policy<FromSystem> &,
                                         const thrust::execution_policy<ToSystem> &to_system,
                                         InputIterator first,
                                         Size n,
@@ -305,5 +306,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/default_construct_range.h b/thrust/detail/allocator/default_construct_range.h
index 6c3856c14..8b5026c05 100644
--- a/thrust/detail/allocator/default_construct_range.h
+++ b/thrust/detail/allocator/default_construct_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -30,7 +29,7 @@ inline void default_construct_range(Allocator &a, Pointer p, Size n);
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/default_construct_range.inl>
 
diff --git a/thrust/detail/allocator/default_construct_range.inl b/thrust/detail/allocator/default_construct_range.inl
index 0f65d4806..6d26578fa 100644
--- a/thrust/detail/allocator/default_construct_range.inl
+++ b/thrust/detail/allocator/default_construct_range.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/for_each.h>
 #include <thrust/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -58,7 +59,7 @@ template<typename Allocator, typename T>
 {};
 
 
-// we know that std::allocator::construct's only effect is to call T's 
+// we know that std::allocator::construct's only effect is to call T's
 // default constructor, so we needn't use it for default construction
 // unless T's constructor does something interesting
 template<typename U, typename T>
@@ -107,5 +108,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/destroy_range.h b/thrust/detail/allocator/destroy_range.h
index bf00037ce..cfc7e3f6e 100644
--- a/thrust/detail/allocator/destroy_range.h
+++ b/thrust/detail/allocator/destroy_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -28,7 +27,7 @@ __host__ __device__
   inline void destroy_range(Allocator &a, Pointer p, Size n);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/destroy_range.inl>
 
diff --git a/thrust/detail/allocator/destroy_range.inl b/thrust/detail/allocator/destroy_range.inl
index d64745766..662177f3a 100644
--- a/thrust/detail/allocator/destroy_range.inl
+++ b/thrust/detail/allocator/destroy_range.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,14 +14,17 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/allocator/destroy_range.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/for_each.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -160,5 +163,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/fill_construct_range.h b/thrust/detail/allocator/fill_construct_range.h
index 9de0f7bcb..a7572cb2d 100644
--- a/thrust/detail/allocator/fill_construct_range.h
+++ b/thrust/detail/allocator/fill_construct_range.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -30,7 +29,7 @@ inline void fill_construct_range(Allocator &a, Pointer p, Size n, const T &value
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/fill_construct_range.inl>
 
diff --git a/thrust/detail/allocator/fill_construct_range.inl b/thrust/detail/allocator/fill_construct_range.inl
index 2f966703f..876b5ddd2 100644
--- a/thrust/detail/allocator/fill_construct_range.inl
+++ b/thrust/detail/allocator/fill_construct_range.inl
@@ -14,16 +14,17 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/allocator/allocator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/for_each.h>
 #include <thrust/uninitialized_fill.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace allocator_traits_detail
@@ -109,5 +110,5 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/malloc_allocator.h b/thrust/detail/allocator/malloc_allocator.h
index 2c01c66bd..af3d0fccb 100644
--- a/thrust/detail/allocator/malloc_allocator.h
+++ b/thrust/detail/allocator/malloc_allocator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/allocator/tagged_allocator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -46,7 +45,7 @@ template<typename T, typename System, typename Pointer>
 };
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/malloc_allocator.inl>
 
diff --git a/thrust/detail/allocator/malloc_allocator.inl b/thrust/detail/allocator/malloc_allocator.inl
index 6dbb98d22..d03d33305 100644
--- a/thrust/detail/allocator/malloc_allocator.inl
+++ b/thrust/detail/allocator/malloc_allocator.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/malloc_allocator.h>
 #include <thrust/system/detail/generic/select_system.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -50,7 +51,7 @@ template<typename T, typename System, typename Pointer>
 
 template<typename T, typename System, typename Pointer>
   void malloc_allocator<T,System,Pointer>
-    ::deallocate(typename malloc_allocator<T,System,Pointer>::pointer p, typename malloc_allocator<T,System,Pointer>::size_type n)
+    ::deallocate(typename malloc_allocator<T,System,Pointer>::pointer p, typename malloc_allocator<T,System,Pointer>::size_type)
 {
   using thrust::system::detail::generic::select_system;
 
@@ -60,5 +61,5 @@ template<typename T, typename System, typename Pointer>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/no_throw_allocator.h b/thrust/detail/allocator/no_throw_allocator.h
index ba8c3d852..a6c16985b 100644
--- a/thrust/detail/allocator/no_throw_allocator.h
+++ b/thrust/detail/allocator/no_throw_allocator.h
@@ -18,8 +18,9 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -44,18 +45,18 @@ template<typename BaseAllocator>
     __host__ __device__
     void deallocate(typename super_t::pointer p, typename super_t::size_type n)
     {
-#ifndef __CUDA_ARCH__
-      try
-      {
+      NV_IF_TARGET(NV_IS_HOST, (
+        try
+        {
+          super_t::deallocate(p, n);
+        } // end try
+        catch(...)
+        {
+          // catch anything
+        } // end catch
+      ), (
         super_t::deallocate(p, n);
-      } // end try
-      catch(...)
-      {
-        // catch anything
-      } // end catch
-#else
-      super_t::deallocate(p, n);
-#endif
+      ));
     } // end deallocate()
 
     inline __host__ __device__
@@ -66,6 +67,6 @@ template<typename BaseAllocator>
 }; // end no_throw_allocator
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/allocator/tagged_allocator.h b/thrust/detail/allocator/tagged_allocator.h
index a29115c6c..804c4e42e 100644
--- a/thrust/detail/allocator/tagged_allocator.h
+++ b/thrust/detail/allocator/tagged_allocator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -95,7 +94,7 @@ __host__ __device__
 bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocator<T2,Pointer2,Tag> &);
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/tagged_allocator.inl>
 
diff --git a/thrust/detail/allocator/tagged_allocator.inl b/thrust/detail/allocator/tagged_allocator.inl
index da1d44457..bcd534cbc 100644
--- a/thrust/detail/allocator/tagged_allocator.inl
+++ b/thrust/detail/allocator/tagged_allocator.inl
@@ -14,23 +14,26 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/tagged_allocator.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
 
 template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
   tagged_allocator<T,Tag,Pointer>
     ::tagged_allocator()
 {}
 
 
 template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
   tagged_allocator<T,Tag,Pointer>
     ::tagged_allocator(const tagged_allocator<T,Tag,Pointer> &)
 {}
@@ -38,18 +41,21 @@ template<typename T, typename Tag, typename Pointer>
 
 template<typename T, typename Tag, typename Pointer>
   template<typename U, typename OtherPointer>
+    __host__ __device__
     tagged_allocator<T,Tag,Pointer>
       ::tagged_allocator(const tagged_allocator<U,Tag,OtherPointer> &)
 {}
 
 
 template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
   tagged_allocator<T,Tag,Pointer>
     ::~tagged_allocator()
 {}
 
 
 template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
   typename tagged_allocator<T,Tag,Pointer>::pointer
     tagged_allocator<T,Tag,Pointer>
       ::address(reference x) const
@@ -59,6 +65,7 @@ template<typename T, typename Tag, typename Pointer>
 
 
 template<typename T, typename Tag, typename Pointer>
+  __host__ __device__
   typename tagged_allocator<T,Tag,Pointer>::const_pointer
     tagged_allocator<T,Tag,Pointer>
       ::address(const_reference x) const
@@ -90,8 +97,8 @@ bool operator!=(const tagged_allocator<T1,Pointer1,Tag> &, const tagged_allocato
 {
   return false;
 }
-    
+
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator/temporary_allocator.h b/thrust/detail/allocator/temporary_allocator.h
index 4d2ac429c..c8ef60625 100644
--- a/thrust/detail/allocator/temporary_allocator.h
+++ b/thrust/detail/allocator/temporary_allocator.h
@@ -23,8 +23,7 @@
 #include <thrust/memory.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -79,7 +78,7 @@ template<typename T, typename System>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/allocator/temporary_allocator.inl>
 
diff --git a/thrust/detail/allocator/temporary_allocator.inl b/thrust/detail/allocator/temporary_allocator.inl
index 97e81d667..ef5d1afa5 100644
--- a/thrust/detail/allocator/temporary_allocator.inl
+++ b/thrust/detail/allocator/temporary_allocator.inl
@@ -14,18 +14,23 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/allocator/temporary_allocator.h>
 #include <thrust/detail/temporary_buffer.h>
 #include <thrust/system/detail/bad_alloc.h>
 #include <cassert>
 
-#ifdef __NVCC__
+#include <nv/target>
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+#if (defined(_NVHPC_CUDA) || defined(__CUDA_ARCH__))
 #include <thrust/system/cuda/detail/terminate.h>
-#endif
+#endif // NVCC device pass or NVC++
+#endif // CUDA
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -45,10 +50,14 @@ __host__ __device__
     // note that we pass cnt to deallocate, not a value derived from result.second
     deallocate(result.first, cnt);
 
-#if !defined(__CUDA_ARCH__)
-    throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
+    ), ( // NV_IS_DEVICE
+      thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+    ));
 #else
-    thrust::system::cuda::detail::terminate_with_message("temporary_buffer::allocate: get_temporary_buffer failed");
+    throw thrust::system::detail::bad_alloc("temporary_buffer::allocate: get_temporary_buffer failed");
 #endif
   } // end if
 
@@ -61,10 +70,10 @@ __host__ __device__
   void temporary_allocator<T,System>
     ::deallocate(typename temporary_allocator<T,System>::pointer p, typename temporary_allocator<T,System>::size_type n)
 {
-  return thrust::return_temporary_buffer(system(), p);
+  return thrust::return_temporary_buffer(system(), p, n);
 } // end temporary_allocator
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/allocator_aware_execution_policy.h b/thrust/detail/allocator_aware_execution_policy.h
new file mode 100644
index 000000000..eea93c035
--- /dev/null
+++ b/thrust/detail/allocator_aware_execution_policy.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execute_with_allocator_fwd.h>
+#include <thrust/detail/alignment.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <type_traits>
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+namespace mr
+{
+
+template<typename T, class MR>
+class allocator;
+
+}
+
+namespace detail
+{
+
+template<template <typename> class ExecutionPolicyCRTPBase>
+struct allocator_aware_execution_policy
+{
+  template<typename MemoryResource>
+  struct execute_with_memory_resource_type
+  {
+    typedef thrust::detail::execute_with_allocator<
+      thrust::mr::allocator<
+        thrust::detail::max_align_t,
+        MemoryResource
+      >,
+      ExecutionPolicyCRTPBase
+    > type;
+  };
+
+  template<typename Allocator>
+  struct execute_with_allocator_type
+  {
+      typedef thrust::detail::execute_with_allocator<
+        Allocator,
+        ExecutionPolicyCRTPBase
+      > type;
+  };
+
+  template<typename MemoryResource>
+    typename execute_with_memory_resource_type<MemoryResource>::type
+      operator()(MemoryResource * mem_res) const
+  {
+    return typename execute_with_memory_resource_type<MemoryResource>::type(mem_res);
+  }
+
+  template<typename Allocator>
+    typename execute_with_allocator_type<Allocator&>::type
+      operator()(Allocator &alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator&>::type(alloc);
+  }
+
+  template<typename Allocator>
+    typename execute_with_allocator_type<Allocator>::type
+      operator()(const Allocator &alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator>::type(alloc);
+  }
+
+#if THRUST_CPP_DIALECT >= 2011
+  // just the rvalue overload
+  // perfect forwarding doesn't help, because a const reference has to be turned
+  // into a value by copying for the purpose of storing it in execute_with_allocator
+  template<typename Allocator,
+      typename std::enable_if<!std::is_lvalue_reference<Allocator>::value>::type * = nullptr>
+    typename execute_with_allocator_type<Allocator>::type
+      operator()(Allocator &&alloc) const
+  {
+    return typename execute_with_allocator_type<Allocator>::type(std::move(alloc));
+  }
+#endif
+};
+
+} // end namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/binary_search.inl b/thrust/detail/binary_search.inl
index 5703226dc..90350ced4 100644
--- a/thrust/detail/binary_search.inl
+++ b/thrust/detail/binary_search.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file binary_search.inl
- *  \brief Inline file for binary_search.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/binary_search.h>
@@ -26,11 +23,9 @@
 #include <thrust/system/detail/generic/binary_search.h>
 #include <thrust/system/detail/adl/binary_search.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -43,7 +38,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -57,7 +52,7 @@ ForwardIterator lower_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -70,7 +65,7 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -84,11 +79,11 @@ ForwardIterator upper_bound(const thrust::detail::execution_policy_base<DerivedP
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                   ForwardIterator first, 
+                   ForwardIterator first,
                    ForwardIterator last,
                    const LessThanComparable& value)
 {
@@ -97,13 +92,13 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
                    ForwardIterator first,
                    ForwardIterator last,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::binary_search;
@@ -111,7 +106,7 @@ bool binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &e
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering>
 __host__ __device__
 thrust::pair<ForwardIterator, ForwardIterator>
@@ -126,7 +121,7 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename LessThanComparable>
 __host__ __device__
 thrust::pair<ForwardIterator, ForwardIterator>
@@ -140,13 +135,13 @@ equal_range(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -155,13 +150,13 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -171,13 +166,13 @@ OutputIterator lower_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -186,13 +181,13 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                           ForwardIterator first, 
+                           ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -202,13 +197,13 @@ OutputIterator upper_bound(const thrust::detail::execution_policy_base<DerivedPo
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                             ForwardIterator first, 
+                             ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output)
 {
@@ -217,13 +212,13 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
 }
 
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template <typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator binary_search(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                             ForwardIterator first, 
+                             ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -238,13 +233,13 @@ OutputIterator binary_search(const thrust::detail::execution_policy_base<Derived
 //////////////////////
 
 template <typename ForwardIterator, typename LessThanComparable>
-ForwardIterator lower_bound(ForwardIterator first, 
+ForwardIterator lower_bound(ForwardIterator first,
                             ForwardIterator last,
                             const LessThanComparable& value)
 {
     using thrust::system::detail::generic::select_system;
 
-    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
     System system;
 
@@ -254,12 +249,12 @@ ForwardIterator lower_bound(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 ForwardIterator lower_bound(ForwardIterator first,
                             ForwardIterator last,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
 
-    typedef typename thrust::iterator_system<ForwardIterator>::type System; 
+    typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
     System system;
 
@@ -267,7 +262,7 @@ ForwardIterator lower_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename LessThanComparable>
-ForwardIterator upper_bound(ForwardIterator first, 
+ForwardIterator upper_bound(ForwardIterator first,
                             ForwardIterator last,
                             const LessThanComparable& value)
 {
@@ -283,7 +278,7 @@ ForwardIterator upper_bound(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 ForwardIterator upper_bound(ForwardIterator first,
                             ForwardIterator last,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
@@ -296,7 +291,7 @@ ForwardIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename LessThanComparable>
-bool binary_search(ForwardIterator first, 
+bool binary_search(ForwardIterator first,
                    ForwardIterator last,
                    const LessThanComparable& value)
 {
@@ -312,7 +307,7 @@ bool binary_search(ForwardIterator first,
 template <typename ForwardIterator, typename T, typename StrictWeakOrdering>
 bool binary_search(ForwardIterator first,
                    ForwardIterator last,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
     using thrust::system::detail::generic::select_system;
@@ -360,9 +355,9 @@ equal_range(ForwardIterator first,
 //////////////////////
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator lower_bound(ForwardIterator first, 
+OutputIterator lower_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -380,9 +375,9 @@ OutputIterator lower_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator lower_bound(ForwardIterator first, 
+OutputIterator lower_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -399,11 +394,11 @@ OutputIterator lower_bound(ForwardIterator first,
 
     return thrust::lower_bound(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
 }
-    
+
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator upper_bound(ForwardIterator first, 
+OutputIterator upper_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output)
 {
@@ -421,9 +416,9 @@ OutputIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator upper_bound(ForwardIterator first, 
+OutputIterator upper_bound(ForwardIterator first,
                            ForwardIterator last,
-                           InputIterator values_first, 
+                           InputIterator values_first,
                            InputIterator values_last,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -442,9 +437,9 @@ OutputIterator upper_bound(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator>
-OutputIterator binary_search(ForwardIterator first, 
+OutputIterator binary_search(ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output)
 {
@@ -462,9 +457,9 @@ OutputIterator binary_search(ForwardIterator first,
 }
 
 template <typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
-OutputIterator binary_search(ForwardIterator first, 
+OutputIterator binary_search(ForwardIterator first,
                              ForwardIterator last,
-                             InputIterator values_first, 
+                             InputIterator values_first,
                              InputIterator values_last,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -482,5 +477,4 @@ OutputIterator binary_search(ForwardIterator first,
     return thrust::binary_search(select_system(system1,system2,system3), first, last, values_first, values_last, output, comp);
 }
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/caching_allocator.h b/thrust/detail/caching_allocator.h
new file mode 100644
index 000000000..941f52755
--- /dev/null
+++ b/thrust/detail/caching_allocator.h
@@ -0,0 +1,47 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/disjoint_tls_pool.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/device_memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+inline
+thrust::mr::allocator<
+    char,
+    thrust::mr::disjoint_unsynchronized_pool_resource<
+        thrust::device_memory_resource,
+        thrust::mr::new_delete_resource
+    >
+> single_device_tls_caching_allocator()
+{
+    return {
+        &thrust::mr::tls_disjoint_pool(
+            thrust::mr::get_global_resource<thrust::device_memory_resource>(),
+            thrust::mr::get_global_resource<thrust::mr::new_delete_resource>()
+        )
+    };
+}
+}
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/arithmetic.h b/thrust/detail/complex/arithmetic.h
index 891853dad..518f18450 100644
--- a/thrust/detail/complex/arithmetic.h
+++ b/thrust/detail/complex/arithmetic.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,205 +15,289 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
+#include <thrust/detail/complex/c99math.h>
 #include <cfloat>
 #include <cmath>
-#include <thrust/detail/complex/c99math.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
   /* --- Binary Arithmetic Operators --- */
 
-template<typename ValueType>
-__host__ __device__ 
-inline complex<ValueType> operator+(const complex<ValueType>& lhs,
-				      const complex<ValueType>& rhs){
-  return complex<ValueType>(lhs.real()+rhs.real(),lhs.imag()+rhs.imag());
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() + y.real(), x.imag() + y.imag());
 }
 
-template<typename ValueType>
-__host__ __device__ 
-inline complex<ValueType> operator+(const volatile complex<ValueType>& lhs,
-				      const volatile complex<ValueType>& rhs){
-  return complex<ValueType>(lhs.real()+rhs.real(),lhs.imag()+rhs.imag());
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() + y, x.imag());
 }
 
-template <typename ValueType> 
-__host__ __device__ 
-inline complex<ValueType> operator+(const complex<ValueType>& lhs, const ValueType & rhs){
-  return complex<ValueType>(lhs.real()+rhs,lhs.imag());
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator+(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x + y.real(), y.imag());
 }
 
-template <typename ValueType> 
-__host__ __device__ 
-inline complex<ValueType> operator+(const ValueType& lhs, const complex<ValueType>& rhs){
-  return complex<ValueType>(rhs.real()+lhs,rhs.imag());
-}
 
-template <typename ValueType> 
-__host__ __device__ 
-inline complex<ValueType> operator-(const complex<ValueType>& lhs, const complex<ValueType>& rhs){
-  return complex<ValueType>(lhs.real()-rhs.real(),lhs.imag()-rhs.imag());
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() - y.real(), x.imag() - y.imag());
 }
 
-template <typename ValueType> 
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator-(const complex<ValueType>& lhs, const ValueType & rhs){
-  return complex<ValueType>(lhs.real()-rhs,lhs.imag());
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() - y, x.imag());
 }
 
-template <typename ValueType> 
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator-(const ValueType& lhs, const complex<ValueType>& rhs){
-  return complex<ValueType>(lhs-rhs.real(),-rhs.imag());
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator-(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x - y.real(), -y.imag());
 }
 
-template <typename ValueType> 
+
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator*(const complex<ValueType>& lhs,
-				      const complex<ValueType>& rhs){
-  return complex<ValueType>(lhs.real()*rhs.real()-lhs.imag()*rhs.imag(),
-			    lhs.real()*rhs.imag()+lhs.imag()*rhs.real());
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>( x.real() * y.real() - x.imag() * y.imag()
+			             , x.real() * y.imag() + x.imag() * y.real());
 }
 
-template <typename ValueType> 
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator*(const complex<ValueType>& lhs, const ValueType & rhs){
-  return complex<ValueType>(lhs.real()*rhs,lhs.imag()*rhs);
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() * y, x.imag() * y);
 }
 
-template <typename ValueType> 
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator*(const ValueType& lhs, const complex<ValueType>& rhs){
-  return complex<ValueType>(rhs.real()*lhs,rhs.imag()*lhs);
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator*(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x * y.real(), x * y.imag());
 }
 
 
-template <typename ValueType>
+template <typename T0, typename T1>
 __host__ __device__
-inline complex<ValueType> operator/(const complex<ValueType>& lhs, const complex<ValueType>& rhs){
-  ValueType s = std::abs(rhs.real()) + std::abs(rhs.imag());
-  ValueType oos = ValueType(1.0) / s;
-  ValueType ars = lhs.real() * oos;
-  ValueType ais = lhs.imag() * oos;
-  ValueType brs = rhs.real() * oos;
-  ValueType bis = rhs.imag() * oos;
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+
+  // Find `abs` by ADL.
+  using std::abs;
+
+  T s = abs(y.real()) + abs(y.imag());
+
+  T oos = T(1.0) / s;
+
+  T ars = x.real() * oos;
+  T ais = x.imag() * oos;
+  T brs = y.real() * oos;
+  T bis = y.imag() * oos;
+
   s = (brs * brs) + (bis * bis);
-  oos = ValueType(1.0) / s;
-  complex<ValueType> quot(((ars * brs) + (ais * bis)) * oos,
-			 ((ais * brs) - (ars * bis)) * oos);
+
+  oos = T(1.0) / s;
+
+  complex<T> quot( ((ars * brs) + (ais * bis)) * oos
+                 , ((ais * brs) - (ars * bis)) * oos);
   return quot;
 }
 
-template <typename ValueType> 
-  __host__ __device__
-  inline complex<ValueType> operator/(const complex<ValueType>& lhs, const ValueType & rhs){
-  return complex<ValueType>(lhs.real()/rhs,lhs.imag()/rhs);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x.real() / y, x.imag() / y);
 }
 
-template <typename ValueType>
-  __host__ __device__
-  inline complex<ValueType> operator/(const ValueType& lhs, const complex<ValueType>& rhs){
-  return complex<ValueType>(lhs)/rhs;
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+operator/(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return complex<T>(x) / y;
 }
 
 
 
 /* --- Unary Arithmetic Operators --- */
 
-template <typename ValueType> 
-  __host__ __device__
-  inline complex<ValueType> operator+(const complex<ValueType>& rhs){
-  return rhs;
+template <typename T>
+__host__ __device__
+complex<T> operator+(const complex<T>& y)
+{
+  return y;
 }
 
-template <typename ValueType> 
-  __host__ __device__
-  inline complex<ValueType> operator-(const complex<ValueType>& rhs){
-  return rhs*-ValueType(1);
+template <typename T>
+__host__ __device__
+complex<T> operator-(const complex<T>& y)
+{
+  return y * -T(1);
 }
 
 
 /* --- Other Basic Arithmetic Functions --- */
 
 // As std::hypot is only C++11 we have to use the C interface
-template <typename ValueType>
-  __host__ __device__
-  inline ValueType abs(const complex<ValueType>& z){
-  return hypot(z.real(),z.imag());
+template <typename T>
+__host__ __device__
+T abs(const complex<T>& z)
+{
+  return hypot(z.real(), z.imag());
 }
 
-namespace detail{
-namespace complex{	
-__host__ __device__ inline float abs(const thrust::complex<float>& z){
+// XXX Why are we specializing here?
+namespace detail {
+namespace complex {
+
+__host__ __device__
+inline float abs(const thrust::complex<float>& z)
+{
   return hypotf(z.real(),z.imag());
 }
 
-__host__ __device__ inline double abs(const thrust::complex<double>& z){
+__host__ __device__
+inline double abs(const thrust::complex<double>& z)
+{
   return hypot(z.real(),z.imag());
 }
-}
-}
+
+} // end namespace complex
+} // end namespace detail
 
 template <>
-  __host__ __device__
-  inline float abs(const complex<float>& z){
+__host__ __device__
+inline float abs(const complex<float>& z)
+{
   return detail::complex::abs(z);
 }
-template<>
-  __host__ __device__
-  inline double abs(const complex<double>& z){
+
+template <>
+__host__ __device__
+inline double abs(const complex<double>& z)
+{
   return detail::complex::abs(z);
 }
 
 
-template <typename ValueType>
-  __host__ __device__
-  inline ValueType arg(const complex<ValueType>& z){
-  return std::atan2(z.imag(),z.real());
+template <typename T>
+__host__ __device__
+T arg(const complex<T>& z)
+{
+  // Find `atan2` by ADL.
+  using std::atan2;
+  return atan2(z.imag(), z.real());
 }
 
-template <typename ValueType>
-  __host__ __device__
-  inline complex<ValueType> conj(const complex<ValueType>& z){
-  return complex<ValueType>(z.real(),-z.imag());
+
+template <typename T>
+__host__ __device__
+complex<T> conj(const complex<T>& z)
+{
+  return complex<T>(z.real(), -z.imag());
 }
 
-template <typename ValueType>
-  __host__ __device__
-  inline ValueType norm(const complex<ValueType>& z){
-  return z.real()*z.real() + z.imag()*z.imag();
+
+template <typename T>
+__host__ __device__
+T norm(const complex<T>& z)
+{
+  return z.real() * z.real() + z.imag() * z.imag();
 }
 
+// XXX Why specialize these, we could just rely on ADL.
 template <>
-  __host__ __device__
-  inline float norm(const complex<float>& z){
-  if(std::abs(z.real()) < ::sqrtf(FLT_MIN) && std::abs(z.imag()) < ::sqrtf(FLT_MIN)){
-    float a = z.real()*4.0f;
-    float b = z.imag()*4.0f;
-    return (a*a+b*b)/16.0f;
-  } 
-  return z.real()*z.real() + z.imag()*z.imag();
+__host__ __device__
+inline float norm(const complex<float>& z)
+{
+  // Find `abs` and `sqrt` by ADL.
+  using std::abs;
+  using std::sqrt;
+
+  if (abs(z.real()) < sqrt(FLT_MIN) && abs(z.imag()) < sqrt(FLT_MIN))
+  {
+    float a = z.real() * 4.0f;
+    float b = z.imag() * 4.0f;
+    return (a * a + b * b) / 16.0f;
+  }
+
+  return z.real() * z.real() + z.imag() * z.imag();
 }
 
 template <>
-  __host__ __device__
-  inline double norm(const complex<double>& z){
-  if(std::abs(z.real()) < ::sqrt(DBL_MIN) && std::abs(z.imag()) < ::sqrt(DBL_MIN)){
-    double a = z.real()*4.0;
-    double b = z.imag()*4.0;
-    return (a*a+b*b)/16.0;
-  } 
-  return z.real()*z.real() + z.imag()*z.imag();
+__host__ __device__
+inline double norm(const complex<double>& z)
+{
+  // Find `abs` and `sqrt` by ADL.
+  using std::abs;
+  using std::sqrt;
+
+  if (abs(z.real()) < sqrt(DBL_MIN) && abs(z.imag()) < sqrt(DBL_MIN))
+  {
+    double a = z.real() * 4.0;
+    double b = z.imag() * 4.0;
+    return (a * a + b * b) / 16.0;
+  }
+
+  return z.real() * z.real() + z.imag() * z.imag();
 }
 
-template <typename ValueType>
-  __host__ __device__
-  inline complex<ValueType> polar(const ValueType & m, const ValueType & theta){ 
-  return complex<ValueType>(m * std::cos(theta),m * std::sin(theta));
-}
 
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+polar(const T0& m, const T1& theta)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+
+  // Find `cos` and `sin` by ADL.
+  using std::cos;
+  using std::sin;
+
+  return complex<T>(m * cos(theta), m * sin(theta));
 }
 
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/complex/c99math.h b/thrust/detail/complex/c99math.h
index 665b759ad..e735b850c 100644
--- a/thrust/detail/complex/c99math.h
+++ b/thrust/detail/complex/c99math.h
@@ -14,20 +14,22 @@
  *  See the License for the specific language governing permissions and
  *  limitations under the License.
  */
-#pragma once 
+#pragma once
 
+#include <thrust/detail/config.h>
+
+#include <math.h>
 #include <cmath>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace complex
 {
 
 // Define basic arithmetic functions so we can use them without explicit scope
-// keeping the code as close as possible to FreeBSDs for ease of maintenance. 
+// keeping the code as close as possible to FreeBSDs for ease of maintenance.
 // It also provides an easy way to support compilers with missing C99 functions.
 // When possible, just use the names in the global scope.
 // Some platforms define these as macros, others as free functions.
@@ -83,11 +85,11 @@ __host__ __device__ inline int isnan(double x){
 }
 
 __host__ __device__ inline int signbit(float x){
-  return (*((uint32_t *)&x)) & 0x80000000;
+  return ((*((uint32_t *)&x)) & 0x80000000) != 0 ? 1 : 0;
 }
 
 __host__ __device__ inline int signbit(double x){
-  return (*((uint32_t *)&x)) & 0x80000000;
+  return ((*((uint64_t *)&x)) & 0x8000000000000000) != 0ull ? 1 : 0;
 }
 
 __host__ __device__ inline int isfinite(float x){
@@ -100,35 +102,25 @@ __host__ __device__ inline int isfinite(double x){
 
 #else
 
-#  ifdef __CUDACC__
-
-// sometimes the CUDA toolkit provides these these names as macros,
-// sometimes functions in the global scope
-
-#    if (CUDA_VERSION >= 6500)
+#  if defined(__CUDACC__) && !(defined(__CUDA__) && defined(__clang__)) && !defined(_NVHPC_CUDA)
+// NVCC implements at least some signature of these as functions not macros.
 using ::isinf;
 using ::isnan;
 using ::signbit;
 using ::isfinite;
-
-#    else
-// these names are macros, we don't need to define them
-
-#    endif // CUDA_VERSION
-
 #  else
-// Some compilers do not provide these in the global scope
-// they are in std:: instead
+// Some compilers do not provide these in the global scope, because they are
+// supposed to be macros. The versions in `std` are supposed to be functions.
 // Since we're not compiling with nvcc, it's safe to use the functions in std::
 using std::isinf;
 using std::isnan;
 using std::signbit;
 using std::isfinite;
 #  endif // __CUDACC__
+#endif // _MSC_VER
 
 using ::atanh;
-#endif // _MSC_VER
-  
+
 #if defined _MSC_VER
 
 __host__ __device__ inline double copysign(double x, double y){
@@ -149,7 +141,7 @@ __host__ __device__ inline float copysignf(float x, float y){
 
 
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__) && !defined(_NVHPC_CUDA)
 
 // Simple approximation to log1p as Visual Studio is lacking one
 inline double log1p(double x){
@@ -159,7 +151,7 @@ inline double log1p(double x){
   }else{
     if(u > 2.0){
       // Use normal log for large arguments
-      return log(u); 
+      return log(u);
     }else{
       return log(u)*(x/(u-1.0));
     }
@@ -173,7 +165,7 @@ inline float log1pf(float x){
   }else{
     if(u > 2.0f){
       // Use normal log for large arguments
-      return logf(u); 
+      return logf(u);
     }else{
       return logf(u)*(x/(u-1.0f));
     }
@@ -201,5 +193,5 @@ inline double hypot(double x, double y){
 
 } // namespace detail
 
-} // namespace thrust
-      
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/catrig.h b/thrust/detail/complex/catrig.h
index 70adf03ff..4955ec5bf 100644
--- a/thrust/detail/complex/catrig.h
+++ b/thrust/detail/complex/catrig.h
@@ -48,27 +48,28 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cfloat>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
-namespace complex{		      	
+namespace complex{
 
 using thrust::complex;
 
 __host__ __device__
 inline void raise_inexact(){
-  const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */ 
+  const volatile float tiny = 7.888609052210118054117286e-31; /* 0x1p-100; */
   // needs the volatile to prevent compiler from ignoring it
   volatile float junk = 1 + tiny;
   (void)junk;
 }
 
 __host__ __device__ inline complex<double> clog_for_large_values(complex<double> z);
-  
+
 /*
  * Testing indicates that all these functions are accurate up to 4 ULP.
  * The functions casin(h) and cacos(h) are about 2.5 times slower than asinh.
@@ -146,7 +147,7 @@ f(double a, double b, double hypot_a_b)
     return (a / 2);
   return (a * a / (hypot_a_b + b) / 2);
 }
-  
+
 /*
  * All the hard work is contained in this function.
  * x and y are assumed positive or zero, and less than RECIP_EPSILON.
@@ -167,10 +168,10 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
   const double A_crossover = 10; /* Hull et al suggest 1.5, but 10 works better */
   const double FOUR_SQRT_MIN = 5.966672584960165394632772e-154; /* =0x1p-509; >= 4 * sqrt(DBL_MIN) */
   const double B_crossover = 0.6417; /* suggested by Hull et al */
-  
+
   R = hypot(x, y + 1);		/* |z+I| */
   S = hypot(x, y - 1);		/* |z-I| */
-  
+
   /* A = (|z+I| + |z-I|) / 2 */
   A = (R + S) / 2;
   /*
@@ -180,7 +181,7 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
    */
   if (A < 1)
     A = 1;
-  
+
   if (A < A_crossover) {
     /*
      * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
@@ -214,9 +215,9 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
   } else {
     *rx = log(A + sqrt(A * A - 1));
   }
-  
+
   *new_y = y;
-  
+
   if (y < FOUR_SQRT_MIN) {
     /*
      * Avoid a possible underflow caused by y/A.  For casinh this
@@ -228,11 +229,11 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
     *new_y = y * (2 / DBL_EPSILON);
     return;
   }
-  
+
   /* B = (|z+I| - |z-I|) / 2 = y/A */
   *B = y / A;
   *B_is_usable = 1;
-  
+
   if (*B > B_crossover) {
     *B_is_usable = 0;
     /*
@@ -274,7 +275,7 @@ do_hard_work(double x, double y, double *rx, int *B_is_usable, double *B,
     }
   }
 }
-  
+
 /*
  * casinh(z) = z + O(z^3)   as z -> 0
  *
@@ -295,7 +296,7 @@ complex<double> casinh(complex<double> z)
   y = z.imag();
   ax = fabs(x);
   ay = fabs(y);
-  
+
   if (isnan(x) || isnan(y)) {
     /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
     if (isinf(x))
@@ -350,10 +351,10 @@ __host__ __device__ inline
 complex<double> casin(complex<double> z)
 {
   complex<double> w = casinh(complex<double>(z.imag(), z.real()));
-  
+
   return (complex<double>(w.imag(), w.real()));
 }
-  
+
 /*
  * cacos(z) = PI/2 - casin(z)
  * but do the computation carefully so cacos(z) is accurate when z is
@@ -450,7 +451,7 @@ complex<double> cacosh(complex<double> z)
 {
   complex<double> w;
   double rx, ry;
-  
+
   w = cacos(z);
   rx = w.real();
   ry = w.imag();
@@ -476,7 +477,7 @@ complex<double> clog_for_large_values(complex<double> z)
   double x, y;
   double ax, ay, t;
   const double m_e = 2.7182818284590452e0; /*  0x15bf0a8b145769.0p-51 */
-  
+
   x = z.real();
   y = z.imag();
   ax = fabs(x);
@@ -486,7 +487,7 @@ complex<double> clog_for_large_values(complex<double> z)
     ax = ay;
     ay = t;
   }
-  
+
   /*
    * Avoid overflow in hypot() when x and y are both very large.
    * Divide x and y by E, and then add 1 to the logarithm.  This depends
@@ -496,7 +497,7 @@ complex<double> clog_for_large_values(complex<double> z)
    */
   if (ax > DBL_MAX / 2)
     return (complex<double>(log(hypot(x / m_e, y / m_e)) + 1, atan2(y, x)));
-  
+
   /*
    * Avoid overflow when x or y is large.  Avoid underflow when x or
    * y is small.
@@ -505,16 +506,16 @@ complex<double> clog_for_large_values(complex<double> z)
   const double SQRT_MIN =	1.491668146240041348658193e-154; /* = 0x1p-511; >= sqrt(DBL_MIN) */
   if (ax > QUARTER_SQRT_MAX || ay < SQRT_MIN)
     return (complex<double>(log(hypot(x, y)), atan2(y, x)));
-  
+
   return (complex<double>(log(ax * ax + ay * ay) / 2, atan2(y, x)));
 }
-  
+
 /*
  *				=================
  *				| catanh, catan |
  *				=================
  */
-  
+
 /*
    * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
    * Assumes x*x and y*y will not overflow.
@@ -529,10 +530,10 @@ inline double sum_squares(double x, double y)
   /* Avoid underflow when y is small. */
   if (y < SQRT_MIN)
     return (x * x);
-  
+
   return (x * x + y * y);
 }
-  
+
 /*
  * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
  * Assumes x and y are not NaN, and one of x and y is larger than
@@ -548,7 +549,7 @@ inline double real_part_reciprocal(double x, double y)
   double scale;
   uint32_t hx, hy;
   int32_t ix, iy;
-  
+
   /*
    * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
    * example 2.
@@ -574,8 +575,8 @@ inline double real_part_reciprocal(double x, double y)
   y *= scale;
   return (x / (x * x + y * y) * scale);
 }
-  
-  
+
+
 /*
  * catanh(z) = log((1+z)/(1-z)) / 2
  *           = log1p(4*x / |z-1|^2) / 4
@@ -588,28 +589,28 @@ inline double real_part_reciprocal(double x, double y)
  * Re(catanh(z)) = x/|z|^2 + O(x/z^4)
  *    as z -> infinity, uniformly in x
  */
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 __host__ __device__ inline
 complex<double> catanh(complex<double> z)
 {
   double x, y, ax, ay, rx, ry;
   const volatile double pio2_lo = 6.1232339957367659e-17; /*  0x11a62633145c07.0p-106 */
   const double pio2_hi = 1.5707963267948966e0;/*  0x1921fb54442d18.0p-52 */
-  
-  
+
+
   x = z.real();
   y = z.imag();
   ax = fabs(x);
   ay = fabs(y);
-  
+
   /* This helps handle many cases. */
   if (y == 0 && ax <= 1)
     return (complex<double>(atanh(x), y));
-  
+
   /* To ensure the same accuracy as atan(), and to filter out z = 0. */
   if (x == 0)
     return (complex<double>(x, atan(y)));
-  
+
   if (isnan(x) || isnan(y)) {
     /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
     if (isinf(x))
@@ -625,12 +626,12 @@ complex<double> catanh(complex<double> z)
      */
     return (complex<double>(x + 0.0 + (y + 0), x + 0.0 + (y + 0)));
   }
-  
+
   const double RECIP_EPSILON = 1.0 / DBL_EPSILON;
   if (ax > RECIP_EPSILON || ay > RECIP_EPSILON)
     return (complex<double>(real_part_reciprocal(x, y),
 			    copysign(pio2_hi + pio2_lo, y)));
-  
+
   const double SQRT_3_EPSILON = 2.5809568279517849e-8; /*  0x1bb67ae8584caa.0p-78 */
   if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
     /*
@@ -641,23 +642,23 @@ complex<double> catanh(complex<double> z)
     raise_inexact();
     return (z);
   }
-  
+
   const double m_ln2 = 6.9314718055994531e-1; /*  0x162e42fefa39ef.0p-53 */
   if (ax == 1 && ay < DBL_EPSILON)
     rx = (m_ln2 - log(ay)) / 2;
   else
     rx = log1p(4 * ax / sum_squares(ax - 1, ay)) / 4;
-  
+
   if (ax == 1)
     ry = atan2(2.0, -ay) / 2;
   else if (ay < DBL_EPSILON)
     ry = atan2(2 * ay, (1 - ax) * (1 + ax)) / 2;
   else
     ry = atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
-  
+
   return (complex<double>(copysign(rx, x), copysign(ry, y)));
 }
-  
+
 /*
  * catan(z) = reverse(catanh(reverse(z)))
  * where reverse(x + I*y) = y + I*x = I*conj(z).
@@ -691,20 +692,20 @@ inline complex<ValueType> asin(const complex<ValueType>& z){
   const complex<ValueType> i(0,1);
   return -i*asinh(i*z);
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> atan(const complex<ValueType>& z){
   const complex<ValueType> i(0,1);
   return -i*thrust::atanh(i*z);
 }
-  
+
 
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> acosh(const complex<ValueType>& z){
   thrust::complex<ValueType> ret((z.real() - z.imag()) * (z.real() + z.imag()) - ValueType(1.0),
-				 ValueType(2.0) * z.real() * z.imag());    
+				 ValueType(2.0) * z.real() * z.imag());
   ret = thrust::sqrt(ret);
   if (z.real() < ValueType(0.0)){
     ret = -ret;
@@ -716,43 +717,43 @@ inline complex<ValueType> acosh(const complex<ValueType>& z){
   }
   return ret;
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> asinh(const complex<ValueType>& z){
   return thrust::log(thrust::sqrt(z*z+ValueType(1))+z);
 }
-  
+
 template <typename ValueType>
 __host__ __device__
 inline complex<ValueType> atanh(const complex<ValueType>& z){
-  ValueType imag2 = z.imag() *  z.imag();   
+  ValueType imag2 = z.imag() *  z.imag();
   ValueType n = ValueType(1.0) + z.real();
   n = imag2 + n * n;
-  
+
   ValueType d = ValueType(1.0) - z.real();
   d = imag2 + d * d;
   complex<ValueType> ret(ValueType(0.25) * (std::log(n) - std::log(d)),0);
-  
+
   d = ValueType(1.0) -  z.real() * z.real() - imag2;
-  
+
   ret.imag(ValueType(0.5) * std::atan2(ValueType(2.0) * z.imag(), d));
   return ret;
 }
-  
+
 template <>
 __host__ __device__
 inline complex<double> acos(const complex<double>& z){
   return detail::complex::cacos(z);
 }
-  
+
 template <>
 __host__ __device__
 inline complex<double> asin(const complex<double>& z){
   return detail::complex::casin(z);
 }
-  
-#if __cplusplus >= 201103L || !defined _MSC_VER
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<double> atan(const complex<double>& z){
@@ -772,8 +773,8 @@ __host__ __device__
 inline complex<double> asinh(const complex<double>& z){
   return detail::complex::casinh(z);
 }
-  
-#if __cplusplus >= 201103L || !defined _MSC_VER
+
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<double> atanh(const complex<double>& z){
@@ -781,4 +782,4 @@ inline complex<double> atanh(const complex<double>& z){
 }
 #endif
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/catrigf.h b/thrust/detail/complex/catrigf.h
index db04c466a..c06791311 100644
--- a/thrust/detail/complex/catrigf.h
+++ b/thrust/detail/complex/catrigf.h
@@ -50,10 +50,11 @@
 
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
+#include <thrust/detail/config.h>
 #include <cfloat>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -166,7 +167,7 @@ casinhf(complex<float> z)
   float x, y, ax, ay, rx, ry, B, sqrt_A2my2, new_y;
   int B_is_usable;
   complex<float> w;
-  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  const float RECIP_EPSILON = 1.0f / FLT_EPSILON;
   const float m_ln2 = 6.9314718055994531e-1f; /*  0x162e42fefa39ef.0p-53 */
   x = z.real();
   y = z.imag();
@@ -245,7 +246,7 @@ complex<float> cacosf(complex<float> z)
     return (complex<float>(x + 0.0f + (y + 0), x + 0.0f + (y + 0)));
   }
 
-  const float RECIP_EPSILON = 1.0 / FLT_EPSILON;
+  const float RECIP_EPSILON = 1.0f / FLT_EPSILON;
   if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
     w = clog_for_large_values(z);
     rx = fabsf(w.imag());
@@ -386,13 +387,13 @@ inline float real_part_reciprocal(float x, float y)
   return (x / (x * x + y * y) * scale);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 __host__ __device__ inline
 complex<float> catanhf(complex<float> z)
 {
   float x, y, ax, ay, rx, ry;
-  const volatile float pio2_lo = 6.1232339957367659e-17; /*  0x11a62633145c07.0p-106 */
-  const float pio2_hi = 1.5707963267948966e0;/*  0x1921fb54442d18.0p-52 */
+  const volatile float pio2_lo = 6.1232339957367659e-17f; /*  0x11a62633145c07.0p-106 */
+  const float pio2_hi = 1.5707963267948966e0f;/*  0x1921fb54442d18.0p-52 */
 
 
   x = z.real();
@@ -421,7 +422,7 @@ complex<float> catanhf(complex<float> z)
     return (complex<float>(real_part_reciprocal(x, y),
 			   copysignf(pio2_hi + pio2_lo, y)));
 
-  const float SQRT_3_EPSILON = 5.9801995673e-4; /*  0x9cc471.0p-34 */
+  const float SQRT_3_EPSILON = 5.9801995673e-4f; /*  0x9cc471.0p-34 */
   if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
     raise_inexact();
     return (z);
@@ -467,7 +468,7 @@ inline complex<float> asin(const complex<float>& z){
   return detail::complex::casinf(z);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<float> atan(const complex<float>& z){
@@ -488,7 +489,7 @@ inline complex<float> asinh(const complex<float>& z){
   return detail::complex::casinhf(z);
 }
 
-#if __cplusplus >= 201103L || !defined _MSC_VER
+#if THRUST_CPP_DIALECT >= 2011 || THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
 template <>
 __host__ __device__
 inline complex<float> atanh(const complex<float>& z){
@@ -496,4 +497,4 @@ inline complex<float> atanh(const complex<float>& z){
 }
 #endif
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ccosh.h b/thrust/detail/complex/ccosh.h
index 300f08afc..722dfcd84 100644
--- a/thrust/detail/complex/ccosh.h
+++ b/thrust/detail/complex/ccosh.h
@@ -47,10 +47,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -210,4 +212,4 @@ inline thrust::complex<double> cosh(const thrust::complex<double>& z){
   return detail::complex::ccosh(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ccoshf.h b/thrust/detail/complex/ccoshf.h
index d33af7c4c..aa43f1208 100644
--- a/thrust/detail/complex/ccoshf.h
+++ b/thrust/detail/complex/ccoshf.h
@@ -48,10 +48,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -138,4 +140,4 @@ inline complex<float> cosh(const complex<float>& z){
   return detail::complex::ccoshf(z);
 }
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/cexp.h b/thrust/detail/complex/cexp.h
index 151df397b..c0c8c07d2 100644
--- a/thrust/detail/complex/cexp.h
+++ b/thrust/detail/complex/cexp.h
@@ -49,10 +49,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 /*
@@ -180,4 +182,4 @@ inline complex<double> exp(const complex<double>& z){
   return detail::complex::cexp(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/cexpf.h b/thrust/detail/complex/cexpf.h
index 6d85c45ed..cae030fe7 100644
--- a/thrust/detail/complex/cexpf.h
+++ b/thrust/detail/complex/cexpf.h
@@ -49,10 +49,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -158,4 +160,4 @@ inline complex<float> exp(const complex<float>& z){
   return detail::complex::cexpf(z);
 }    
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/clog.h b/thrust/detail/complex/clog.h
index 8d288df02..b727121c3 100644
--- a/thrust/detail/complex/clog.h
+++ b/thrust/detail/complex/clog.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -46,10 +46,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -58,7 +60,7 @@ using thrust::complex;
 /* round down to 18 = 54/3 bits */
 __host__ __device__ inline
 double trim(double x){
-  uint32_t hi;    
+  uint32_t hi;
   get_high_word(hi, x);
   insert_words(x, hi &0xfffffff8, 0);
   return x;
@@ -120,7 +122,7 @@ complex<double> clog(const complex<double>& z){
     return (complex<double>(std::log(hypot(x, y)), std::atan2(y, x)));
   }
 
-  /* 
+  /*
    * From this point on, we don't need to worry about underflow or
    * overflow in calculating ax*ax or ay*ay.
    */
@@ -183,7 +185,7 @@ complex<double> clog(const complex<double>& z){
   }
   return (complex<double>(0.5 * log1p(hm1), atan2(y, x)));
 }
-  
+
 } // namespace complex
 
 } // namespace detail
@@ -202,11 +204,11 @@ inline complex<double> log(const complex<double>& z){
 
 template <typename ValueType>
 __host__ __device__
-inline complex<ValueType> log10(const complex<ValueType>& z){ 
+inline complex<ValueType> log10(const complex<ValueType>& z){
   // Using the explicit literal prevents compile time warnings in
-  // devices that don't support doubles 
+  // devices that don't support doubles
   return thrust::log(z)/ValueType(2.30258509299404568402);
 }
 
-} // namespace thrust
-    
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/clogf.h b/thrust/detail/complex/clogf.h
index 7f3314ed2..c72370c42 100644
--- a/thrust/detail/complex/clogf.h
+++ b/thrust/detail/complex/clogf.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -45,10 +45,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -74,7 +76,7 @@ complex<float> clogf(const complex<float>& z){
   float ax, ay;
   float x0, y0, x1, y1, x2, y2, t, hm1;
   float val[12];
-  int i, sorted;	
+  int i, sorted;
   const float e = 2.7182818284590452354f;
 
   x = z.real();
@@ -102,7 +104,7 @@ complex<float> clogf(const complex<float>& z){
    */
   // For high values of ay -> hypotf(FLT_MAX,ay) = inf
   // We expect that for values at or below ay = 1e34f this should not happen
-  if (ay > 1e34f){ 
+  if (ay > 1e34f){
     return (complex<float>(std::log(hypotf(x / e, y / e)) + 1.0f, std::atan2(y, x)));
   }
   if (ax == 1.f) {
@@ -120,7 +122,7 @@ complex<float> clogf(const complex<float>& z){
     return (complex<float>(std::log(hypotf(x, y)), std::atan2(y, x)));
   }
 
-  /* 
+  /*
    * From this point on, we don't need to worry about underflow or
    * overflow in calculating ax*ax or ay*ay.
    */
@@ -194,5 +196,5 @@ inline complex<float> log(const complex<float>& z){
   return detail::complex::clogf(z);
 }
 
-} // namespace thrust
-    
+THRUST_NAMESPACE_END
+
diff --git a/thrust/detail/complex/complex.inl b/thrust/detail/complex/complex.inl
index ec1ab30e7..a00b81a4b 100644
--- a/thrust/detail/complex/complex.inl
+++ b/thrust/detail/complex/complex.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,76 +15,241 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /* --- Constructors --- */
 
+#if THRUST_CPP_DIALECT < 2011
 template <typename T>
-inline __host__ __device__  complex<T>
-::complex(const T & re, const T& im)
+__host__ __device__
+complex<T>::complex()
+{
+  real(T());
+  imag(T());
+}
+#endif
+
+template <typename T>
+__host__ __device__
+complex<T>::complex(const T& re)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{re, T()}
+{}
+#else
+{
+  real(re);
+  imag(T());
+}
+#endif
+
+
+template <typename T>
+__host__ __device__
+complex<T>::complex(const T& re, const T& im)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{re, im}
+{}
+#else
 {
   real(re);
   imag(im);
-} 
+}
+#endif
+
+#if THRUST_CPP_DIALECT < 2011
+template <typename T>
+__host__ __device__
+complex<T>::complex(const complex<T>& z)
+{
+  real(z.real());
+  imag(z.imag());
+}
+#endif
 
 template <typename T>
-template <typename X> 
-inline __host__ __device__ complex<T>
-::complex(const complex<X> & z)
+template <typename U>
+__host__ __device__
+complex<T>::complex(const complex<U>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  // We do a functional-style cast here to suppress conversion warnings.
+  : data{T(z.real()), T(z.imag())}
+{}
+#else
 {
-  // The explicit T() is there no prevent Visual Studio from complaining
-  // about potential loss of precision
   real(T(z.real()));
   imag(T(z.imag()));
-}  
+}
+#endif
 
 template <typename T>
-template <typename X> 
-inline __host__ complex<T>
-::complex(const std::complex<X> & z)
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>::complex(const std::complex<T>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  : data{THRUST_STD_COMPLEX_REAL(z), THRUST_STD_COMPLEX_IMAG(z)}
+{}
+#else
+{
+  real(THRUST_STD_COMPLEX_REAL(z));
+  imag(THRUST_STD_COMPLEX_IMAG(z));
+}
+#endif
+
+template <typename T>
+template <typename U>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>::complex(const std::complex<U>& z)
+#if THRUST_CPP_DIALECT >= 2011
+  // Initialize the storage in the member initializer list using C++ unicorn
+  // initialization. This allows `complex<T const>` to work.
+  // We do a functional-style cast here to suppress conversion warnings.
+  : data{T(THRUST_STD_COMPLEX_REAL(z)), T(THRUST_STD_COMPLEX_IMAG(z))}
+{}
+#else
+{
+  real(T(THRUST_STD_COMPLEX_REAL(z)));
+  imag(T(THRUST_STD_COMPLEX_IMAG(z)));
+}
+#endif
+
+
+
+/* --- Assignment Operators --- */
+
+template <typename T>
+__host__ __device__
+complex<T>& complex<T>::operator=(const T& re)
+{
+  real(re);
+  imag(T());
+  return *this;
+}
+
+#if THRUST_CPP_DIALECT < 2011
+template <typename T>
+__host__ __device__
+complex<T>& complex<T>::operator=(const complex<T>& z)
+{
+  real(z.real());
+  imag(z.imag());
+  return *this;
+}
+#endif
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator=(const complex<U>& z)
 {
-  // The explicit T() is there no prevent Visual Studio from complaining
-  // about potential loss of precision
   real(T(z.real()));
   imag(T(z.imag()));
-}  
+  return *this;
+}
+
+template <typename T>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>& complex<T>::operator=(const std::complex<T>& z)
+{
+  real(THRUST_STD_COMPLEX_REAL(z));
+  imag(THRUST_STD_COMPLEX_IMAG(z));
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ THRUST_STD_COMPLEX_DEVICE
+complex<T>& complex<T>::operator=(const std::complex<U>& z)
+{
+  real(T(THRUST_STD_COMPLEX_REAL(z)));
+  imag(T(THRUST_STD_COMPLEX_IMAG(z)));
+  return *this;
+}
 
 
 
 /* --- Compound Assignment Operators --- */
 
 template <typename T>
-__host__ __device__  inline 
-complex<T>& complex<T>::operator+=(const complex<T> z)
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator+=(const complex<U>& z)
 {
-  real(real()+z.real());
-  imag(imag()+z.imag());
+  *this = *this + z;
   return *this;
 }
 
 template <typename T>
+template <typename U>
 __host__ __device__
-inline complex<T>& complex<T>::operator-=(const complex<T> z)
+complex<T>& complex<T>::operator-=(const complex<U>& z)
 {
-  real(real()-z.real());
-  imag(imag()-z.imag());
+  *this = *this - z;
   return *this;
 }
 
 template <typename T>
+template <typename U>
 __host__ __device__
-inline complex<T>& complex<T>::operator*=(const complex<T> z)
+complex<T>& complex<T>::operator*=(const complex<U>& z)
 {
   *this = *this * z;
   return *this;
 }
 
 template <typename T>
+template <typename U>
 __host__ __device__
-inline complex<T>& complex<T>::operator/=(const complex<T> z)
+complex<T>& complex<T>::operator/=(const complex<U>& z)
+{
+  *this = *this / z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator+=(const U& z)
+{
+  *this = *this + z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator-=(const U& z)
+{
+  *this = *this - z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator*=(const U& z)
+{
+  *this = *this * z;
+  return *this;
+}
+
+template <typename T>
+template <typename U>
+__host__ __device__
+complex<T>& complex<T>::operator/=(const U& z)
 {
   *this = *this / z;
   return *this;
@@ -94,52 +259,80 @@ inline complex<T>& complex<T>::operator/=(const complex<T> z)
 
 /* --- Equality Operators --- */
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator==(const complex<T>& lhs, const complex<T>& rhs){
-  if(lhs.real() == rhs.real() && lhs.imag() == rhs.imag()){
-    return true;
-  }
-  return false;
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const complex<T1>& y)
+{
+  return x.real() == y.real() && x.imag() == y.imag();
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const complex<T0>& x, const std::complex<T1>& y)
+{
+  return x.real() == THRUST_STD_COMPLEX_REAL(y) && x.imag() == THRUST_STD_COMPLEX_IMAG(y);
+}
+
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator==(const std::complex<T0>& x, const complex<T1>& y)
+{
+  return THRUST_STD_COMPLEX_REAL(x) == y.real() && THRUST_STD_COMPLEX_IMAG(x) == y.imag();
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const T0& x, const complex<T1>& y)
+{
+  return x == y.real() && y.imag() == T1();
+}
+
+template <typename T0, typename T1>
+__host__ __device__
+bool operator==(const complex<T0>& x, const T1& y)
+{
+  return x.real() == y && x.imag() == T1();
 }
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator==(const T & lhs, const complex<T>& rhs){
-  if(lhs == rhs.real() && rhs.imag() == 0){
-    return true;
-  }
-  return false;
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const complex<T1>& y)
+{
+  return !(x == y);
 }
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator==(const complex<T> & lhs, const T& rhs){
-  if(lhs.real() == rhs && lhs.imag() == 0){
-    return true;
-  }
-  return false;
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const complex<T0>& x, const std::complex<T1>& y)
+{
+  return !(x == y);
 }
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator!=(const complex<T>& lhs, const complex<T>& rhs){
-  return !(lhs == rhs);
+template <typename T0, typename T1>
+__host__ THRUST_STD_COMPLEX_DEVICE
+bool operator!=(const std::complex<T0>& x, const complex<T1>& y)
+{
+  return !(x == y);
 }
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator!=(const T & lhs, const complex<T>& rhs){
-  return !(lhs == rhs);
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const T0& x, const complex<T1>& y)
+{
+  return !(x == y);
 }
 
-template <typename T> 
-  __host__ __device__
-  inline bool operator!=(const complex<T> & lhs, const T& rhs){
-  return !(lhs == rhs);
+template <typename T0, typename T1>
+__host__ __device__
+bool operator!=(const complex<T0>& x, const T1& y)
+{
+  return !(x == y);
 }
 
-} 
+template <typename T>
+struct proclaim_trivially_relocatable<complex<T> > : thrust::true_type {};
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/complex/arithmetic.h>
 #include <thrust/detail/complex/cproj.h>
@@ -148,7 +341,6 @@ template <typename T>
 #include <thrust/detail/complex/clog.h>
 #include <thrust/detail/complex/clogf.h>
 #include <thrust/detail/complex/cpow.h>
-#include <thrust/detail/complex/cpowf.h>
 #include <thrust/detail/complex/ccosh.h>
 #include <thrust/detail/complex/ccoshf.h>
 #include <thrust/detail/complex/csinh.h>
diff --git a/thrust/detail/complex/cpow.h b/thrust/detail/complex/cpow.h
index f397ecf53..c204c451f 100644
--- a/thrust/detail/complex/cpow.h
+++ b/thrust/detail/complex/cpow.h
@@ -17,59 +17,41 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust{
-
-template <typename T>
-  __host__ __device__
-  inline complex<T> pow(const complex<T>& z, const complex<T> & exponent){
-  return thrust::exp(thrust::log(z)*exponent);
-}
+THRUST_NAMESPACE_BEGIN
 
-/* This function should be changed as soon as FreeBSD's msun gets a cpow function */
-template <>
-  __host__ __device__
-  inline complex<double> pow(const complex<double>& z, const complex<double> & exponent){
-  return thrust::exp(thrust::log(z)*exponent);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return exp(log(complex<T>(x)) * complex<T>(y));
 }
 
-template <typename T>
-  __host__ __device__
-  inline complex<T> pow(const complex<T>& z, const T & exponent){
-  return thrust::exp(thrust::log(z)*exponent);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const complex<T0>& x, const T1& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  return exp(log(complex<T>(x)) * T(y));
 }
 
-template <typename T>
-  __host__ __device__
-  inline complex<T> pow(const T & x, const complex<T> & exponent){
-  return thrust::exp(std::log(x)*exponent);
+template <typename T0, typename T1>
+__host__ __device__
+complex<typename detail::promoted_numerical_type<T0, T1>::type>
+pow(const T0& x, const complex<T1>& y)
+{
+  typedef typename detail::promoted_numerical_type<T0, T1>::type T;
+  // Find `log` by ADL.
+  using std::log;
+  return exp(log(T(x)) * complex<T>(y));
 }
 
-#if !defined _MSC_VER
+THRUST_NAMESPACE_END
 
-template <typename T, typename U>
-  __host__ __device__ 
-  inline complex<typename detail::promoted_numerical_type<T,U>::type > pow(const complex<T>& z, const complex<T>& exponent){
-  typedef typename detail::promoted_numerical_type<T,U>::type PromotedType;
-  return thrust::exp(thrust::log(complex<PromotedType>(z))*complex<PromotedType>(exponent));
-}
-
-template <typename T, typename U>
-  __host__ __device__ 
-  inline complex<typename detail::promoted_numerical_type<T,U>::type > pow(const complex<T>& z, const U& exponent){
-  typedef typename detail::promoted_numerical_type<T,U>::type PromotedType;
-  return thrust::exp(thrust::log(complex<PromotedType>(z))*PromotedType(exponent));
-}
-
-template <typename T, typename U>
-  __host__ __device__ 
-  inline complex<typename detail::promoted_numerical_type<T,U>::type > pow(const T& x, const complex<U>& exponent){
-  typedef typename detail::promoted_numerical_type<T,U>::type PromotedType;
-  return thrust::exp(std::log(PromotedType(x))*complex<PromotedType>(exponent));
-}
-
-#endif
-
-}
diff --git a/thrust/detail/complex/cpowf.h b/thrust/detail/complex/cpowf.h
deleted file mode 100644
index 715958c88..000000000
--- a/thrust/detail/complex/cpowf.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- * Copyright 2013 Filipe Maia
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License. 
- */
-#pragma once
-
-#include <thrust/complex.h>
-
-namespace thrust{
-
-/* This function should be changed as soon as FreeBSD's msun gets a cpowf function */
-template <>
-__host__ __device__
-inline complex<float> pow(const complex<float>& z, const complex<float> & exponent){
-  return thrust::exp(thrust::log(z)*exponent);
-}
-
-}
diff --git a/thrust/detail/complex/cproj.h b/thrust/detail/complex/cproj.h
index bc2fa7a17..7537c99fd 100644
--- a/thrust/detail/complex/cproj.h
+++ b/thrust/detail/complex/cproj.h
@@ -17,11 +17,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{	 
 __host__ __device__
@@ -67,6 +69,4 @@ inline thrust::complex<float> proj(const thrust::complex<float>& z){
   return detail::complex::cprojf(z);
 }
 
-}
-
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csinh.h b/thrust/detail/complex/csinh.h
index 42d831d9b..b5a22af01 100644
--- a/thrust/detail/complex/csinh.h
+++ b/thrust/detail/complex/csinh.h
@@ -48,17 +48,19 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
 using thrust::complex;
 
 __host__ __device__ inline
-complex<float> csinh(const complex<double>& z){
+complex<double> csinh(const complex<double>& z){
   double x, y, h;
   uint32_t hx, hy, ix, iy, lx, ly;
   const double huge = 8.98846567431157953864652595395e+307; // 0x1p1023;
@@ -202,4 +204,4 @@ inline complex<double> sinh(const complex<double>& z){
   return detail::complex::csinh(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csinhf.h b/thrust/detail/complex/csinhf.h
index bf4fb0816..d271081c6 100644
--- a/thrust/detail/complex/csinhf.h
+++ b/thrust/detail/complex/csinhf.h
@@ -48,10 +48,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -139,4 +141,4 @@ inline complex<float> sinh(const complex<float>& z){
   return detail::complex::csinhf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csqrt.h b/thrust/detail/complex/csqrt.h
index dcffbee95..eb4da5289 100644
--- a/thrust/detail/complex/csqrt.h
+++ b/thrust/detail/complex/csqrt.h
@@ -49,11 +49,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -149,4 +151,4 @@ inline complex<double> sqrt(const complex<double>& z){
   return detail::complex::csqrt(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/csqrtf.h b/thrust/detail/complex/csqrtf.h
index 125d4b60d..dba489a33 100644
--- a/thrust/detail/complex/csqrtf.h
+++ b/thrust/detail/complex/csqrtf.h
@@ -49,11 +49,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -144,4 +146,4 @@ inline complex<float> sqrt(const complex<float>& z){
   return detail::complex::csqrtf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ctanh.h b/thrust/detail/complex/ctanh.h
index 6ef159092..3275c0343 100644
--- a/thrust/detail/complex/ctanh.h
+++ b/thrust/detail/complex/ctanh.h
@@ -87,11 +87,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -197,4 +199,4 @@ inline complex<double> tanh(const complex<double>& z){
   return detail::complex::ctanh(z);
 }
   
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/ctanhf.h b/thrust/detail/complex/ctanhf.h
index f6923d1df..221b5ce47 100644
--- a/thrust/detail/complex/ctanhf.h
+++ b/thrust/detail/complex/ctanhf.h
@@ -52,11 +52,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 #include <thrust/detail/complex/math_private.h>
 #include <cmath>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{		      	
 
@@ -121,4 +123,4 @@ inline complex<float> tanh(const complex<float>& z){
   return detail::complex::ctanhf(z);
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/complex/math_private.h b/thrust/detail/complex/math_private.h
index bc2d6357f..3a40c8e72 100644
--- a/thrust/detail/complex/math_private.h
+++ b/thrust/detail/complex/math_private.h
@@ -35,7 +35,7 @@
 #include <thrust/complex.h>
 #include <thrust/detail/cstdint.h>
 
-namespace thrust{
+THRUST_NAMESPACE_BEGIN
 namespace detail{
 namespace complex{
 
@@ -130,7 +130,7 @@ void  extract_words(int32_t & ix0,int32_t & ix1, double d){
 
 } // namespace detail
 
-} // namespace thrust
+THRUST_NAMESPACE_END
 
 
 #include <thrust/detail/complex/c99math.h>
diff --git a/thrust/detail/complex/stream.h b/thrust/detail/complex/stream.h
index 9d87bbd54..95434b41b 100644
--- a/thrust/detail/complex/stream.h
+++ b/thrust/detail/complex/stream.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *  Copyright 2013 Filipe RNC Maia
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,26 +15,29 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/complex.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 template<typename ValueType,class charT, class traits>
 std::basic_ostream<charT, traits>& operator<<(std::basic_ostream<charT, traits>& os, const complex<ValueType>& z)
 {
   os << '(' << z.real() << ',' << z.imag() << ')';
   return os;
 }
-  
+
 template<typename ValueType, typename charT, class traits>
 std::basic_istream<charT, traits>&
 operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z)
 {
   ValueType re, im;
-    
+
   charT ch;
   is >> ch;
-    
+
   if(ch == '(')
     {
       is >> re >> ch;
@@ -68,4 +71,4 @@ operator>>(std::basic_istream<charT, traits>& is, complex<ValueType>& z)
   return is;
 }
 
-} // namespace thrust
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/config.h b/thrust/detail/config.h
index e1eb8dc58..5a5573a41 100644
--- a/thrust/detail/config.h
+++ b/thrust/detail/config.h
@@ -19,5 +19,6 @@
 
 #pragma once
 
+#include <thrust/version.h>
 #include <thrust/detail/config/config.h>
 
diff --git a/thrust/detail/config/compiler.h b/thrust/detail/config/compiler.h
index 45c4a43d9..e35652f6a 100644
--- a/thrust/detail/config/compiler.h
+++ b/thrust/detail/config/compiler.h
@@ -20,92 +20,170 @@
 
 #pragma once
 
-#ifdef __CUDACC__
-
-#include <cuda.h>
-
-// Thrust supports CUDA >= 3.0
-#if CUDA_VERSION < 3000
-#error "CUDA v3.0 or newer is required"
-#endif // CUDA_VERSION
-
-#endif // __CUDACC__
-
 // enumerate host compilers we know about
 #define THRUST_HOST_COMPILER_UNKNOWN 0
 #define THRUST_HOST_COMPILER_MSVC    1
 #define THRUST_HOST_COMPILER_GCC     2
 #define THRUST_HOST_COMPILER_CLANG   3
+#define THRUST_HOST_COMPILER_INTEL   4
 
 // enumerate device compilers we know about
 #define THRUST_DEVICE_COMPILER_UNKNOWN 0
 #define THRUST_DEVICE_COMPILER_MSVC    1
 #define THRUST_DEVICE_COMPILER_GCC     2
-#define THRUST_DEVICE_COMPILER_NVCC    3
-#define THRUST_DEVICE_COMPILER_CLANG   4
+#define THRUST_DEVICE_COMPILER_CLANG   3
+#define THRUST_DEVICE_COMPILER_NVCC    4
 
 // figure out which host compiler we're using
 // XXX we should move the definition of THRUST_DEPRECATED out of this logic
 #if   defined(_MSC_VER)
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_MSVC
-#define THRUST_DEPRECATED __declspec(deprecated)
+#define THRUST_MSVC_VERSION _MSC_VER
+#define THRUST_MSVC_VERSION_FULL _MSC_FULL_VER
+#elif defined(__ICC)
+#define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_INTEL
 #elif defined(__clang__)
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_CLANG
-#define THRUST_DEPRECATED __attribute__ ((deprecated)) 
 #define THRUST_CLANG_VERSION (__clang_major__ * 10000 + __clang_minor__ * 100 + __clang_patchlevel__)
 #elif defined(__GNUC__)
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_GCC
-#define THRUST_DEPRECATED __attribute__ ((deprecated)) 
 #define THRUST_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#if (THRUST_GCC_VERSION >= 50000)
+#define THRUST_MODERN_GCC
+#else
+#define THRUST_LEGACY_GCC
+#endif
 #else
 #define THRUST_HOST_COMPILER THRUST_HOST_COMPILER_UNKNOWN
-#define THRUST_DEPRECATED
 #endif // THRUST_HOST_COMPILER
 
 // figure out which device compiler we're using
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_MSVC
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_GCC
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+// CUDA-capable clang should behave similar to NVCC.
+#if defined(__CUDA__)
+#define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_NVCC
+#else
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_CLANG
+#endif
 #else
 #define THRUST_DEVICE_COMPILER THRUST_DEVICE_COMPILER_UNKNOWN
 #endif
 
 // is the device compiler capable of compiling omp?
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(_NVHPC_STDPAR_OPENMP)
 #define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_TRUE
 #else
 #define THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE THRUST_FALSE
 #endif // _OPENMP
 
-// disable specific MSVC warnings
+
 #if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && !defined(__CUDA_ARCH__)
-#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x) \
-__pragma(warning(push)) \
-__pragma(warning(disable : x))
-#define __THRUST_DISABLE_MSVC_WARNING_END(x) \
-__pragma(warning(pop))
+  #define THRUST_DISABLE_MSVC_WARNING_BEGIN(x)                                \
+    __pragma(warning(push))                                                   \
+    __pragma(warning(disable : x))                                            \
+    /**/
+  #define THRUST_DISABLE_MSVC_WARNING_END(x)                                  \
+    __pragma(warning(pop))                                                    \
+    /**/
+#else
+  #define THRUST_DISABLE_MSVC_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_MSVC_WARNING_END(x)
+#endif
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) && !defined(__CUDA_ARCH__)
+  #define THRUST_IGNORE_CLANG_WARNING_IMPL(x)                                 \
+    THRUST_PP_STRINGIZE(clang diagnostic ignored x)                           \
+    /**/
+  #define THRUST_IGNORE_CLANG_WARNING(x)                                      \
+    THRUST_IGNORE_CLANG_WARNING_IMPL(THRUST_PP_STRINGIZE(x))                  \
+    /**/
+
+  #define THRUST_DISABLE_CLANG_WARNING_BEGIN(x)                               \
+    _Pragma("clang diagnostic push")                                          \
+    _Pragma(THRUST_IGNORE_CLANG_WARNING(x))                                   \
+    /**/
+  #define THRUST_DISABLE_CLANG_WARNING_END(x)                                 \
+    _Pragma("clang diagnostic pop")                                           \
+    /**/
+#else
+  #define THRUST_DISABLE_CLANG_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_CLANG_WARNING_END(x)
+#endif
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && !defined(__CUDA_ARCH__)
+  #define THRUST_IGNORE_GCC_WARNING_IMPL(x)                                   \
+    THRUST_PP_STRINGIZE(GCC diagnostic ignored x)                             \
+    /**/
+  #define THRUST_IGNORE_GCC_WARNING(x)                                        \
+    THRUST_IGNORE_GCC_WARNING_IMPL(THRUST_PP_STRINGIZE(x))                    \
+    /**/
+
+  #define THRUST_DISABLE_GCC_WARNING_BEGIN(x)                                 \
+    _Pragma("GCC diagnostic push")                                            \
+    _Pragma(THRUST_IGNORE_GCC_WARNING(x))                                     \
+    /**/
+  #define THRUST_DISABLE_GCC_WARNING_END(x)                                   \
+    _Pragma("GCC diagnostic pop")                                             \
+    /**/
 #else
-#define __THRUST_DISABLE_MSVC_WARNING_BEGIN(x)
-#define __THRUST_DISABLE_MSVC_WARNING_END(x)
+  #define THRUST_DISABLE_GCC_WARNING_BEGIN(x)
+  #define THRUST_DISABLE_GCC_WARNING_END(x)
 #endif
-#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(x) \
-__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267) \
-x;\
-__THRUST_DISABLE_MSVC_WARNING_END(4244 4267)
-#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN \
-__THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267)
-#define __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END \
-__THRUST_DISABLE_MSVC_WARNING_END(4244 4267)
-#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL(x) \
-__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800) \
-x;\
-__THRUST_DISABLE_MSVC_WARNING_END(4800)
-#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN \
-__THRUST_DISABLE_MSVC_WARNING_BEGIN(4800)
-#define __THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END \
-__THRUST_DISABLE_MSVC_WARNING_END(4800)
+
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN               \
+  THRUST_DISABLE_MSVC_WARNING_BEGIN(4244 4267)                                \
+  /**/
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END                 \
+  THRUST_DISABLE_MSVC_WARNING_END(4244 4267)                                  \
+  /**/
+#define THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(x)                  \
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN                     \
+  x;                                                                          \
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END                       \
+  /**/
+
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN               \
+  THRUST_DISABLE_MSVC_WARNING_BEGIN(4800)                                     \
+  /**/
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END                 \
+  THRUST_DISABLE_MSVC_WARNING_END(4800)                                       \
+  /**/
+#define THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING(x)                  \
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN                     \
+  x;                                                                          \
+  THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END                       \
+  /**/
+
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_BEGIN                    \
+  THRUST_DISABLE_CLANG_WARNING_BEGIN(-Wself-assign)                           \
+  /**/
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_END                      \
+  THRUST_DISABLE_CLANG_WARNING_END(-Wself-assign)                             \
+  /**/
+#define THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING(x)                       \
+  THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_BEGIN                          \
+  x;                                                                          \
+  THRUST_DISABLE_CLANG_SELF_ASSIGNMENT_WARNING_END                            \
+  /**/
+
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN     \
+  THRUST_DISABLE_CLANG_WARNING_BEGIN(-Wreorder)                               \
+  THRUST_DISABLE_GCC_WARNING_BEGIN(-Wreorder)                                 \
+  /**/
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END       \
+  THRUST_DISABLE_CLANG_WARNING_END(-Wreorder)                                 \
+  THRUST_DISABLE_GCC_WARNING_END(-Wreorder)                                   \
+  /**/
+#define THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING(x)        \
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN           \
+  x;                                                                          \
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END             \
+  /**/
+
+
diff --git a/thrust/detail/config/compiler_fence.h b/thrust/detail/config/compiler_fence.h
index 7b8097f03..c379abaf3 100644
--- a/thrust/detail/config/compiler_fence.h
+++ b/thrust/detail/config/compiler_fence.h
@@ -17,6 +17,14 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/preprocessor.h>
+
+// TODO: Enable this or remove this file once nvGRAPH/CUSP migrates off of it.
+//#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+//  #pragma message("warning: The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 or C11 atomics instead.")
+//#else
+//  #warning The functionality in this header is unsafe, deprecated, and will soon be removed. Use C++11 or C11 atomics instead.
+//#endif
 
 // msvc case
 #if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
diff --git a/thrust/detail/config/config.h b/thrust/detail/config/config.h
index e2bcfa503..797f6605b 100644
--- a/thrust/detail/config/config.h
+++ b/thrust/detail/config/config.h
@@ -20,18 +20,21 @@
 
 #pragma once
 
-// XXX the order of these #includes matters
+// NOTE: The order of these #includes matters.
 
 #include <thrust/detail/config/simple_defines.h>
 #include <thrust/detail/config/compiler.h>
+#include <thrust/detail/config/cpp_dialect.h>
+#include <thrust/detail/config/cpp_compatibility.h>
+#include <thrust/detail/config/deprecated.h>
 // host_system.h & device_system.h must be #included as early as possible
 // because other config headers depend on it
 #include <thrust/detail/config/host_system.h>
 #include <thrust/detail/config/device_system.h>
 #include <thrust/detail/config/host_device.h>
 #include <thrust/detail/config/debug.h>
-#include <thrust/detail/config/compiler_fence.h>
 #include <thrust/detail/config/forceinline.h>
 #include <thrust/detail/config/exec_check_disable.h>
 #include <thrust/detail/config/global_workarounds.h>
+#include <thrust/detail/config/namespace.h>
 
diff --git a/thrust/detail/config/cpp_compatibility.h b/thrust/detail/config/cpp_compatibility.h
new file mode 100644
index 000000000..18b9cbdcf
--- /dev/null
+++ b/thrust/detail/config/cpp_compatibility.h
@@ -0,0 +1,101 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#include <cstddef>
+
+#ifndef __has_cpp_attribute
+#  define __has_cpp_attribute(X) 0
+#endif
+
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+#  define THRUST_TRAILING_RETURN(...)
+#else
+#  define THRUST_TRAILING_RETURN(...) -> __VA_ARGS__
+#endif
+
+#if THRUST_CPP_DIALECT >= 2014 && __has_cpp_attribute(nodiscard)
+#  define THRUST_NODISCARD [[nodiscard]]
+#else
+#  define THRUST_NODISCARD
+#endif
+
+#if THRUST_CPP_DIALECT >= 2017 && __cpp_if_constexpr
+#  define THRUST_IF_CONSTEXPR if constexpr
+#else
+#  define THRUST_IF_CONSTEXPR if
+#endif
+
+// FIXME: Combine THRUST_INLINE_CONSTANT and
+// THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT into one macro when NVCC properly
+// supports `constexpr` globals in host and device code.
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+// FIXME: Add this when NVCC supports inline variables.
+//#  if   THRUST_CPP_DIALECT >= 2017
+//#    define THRUST_INLINE_CONSTANT                 inline constexpr
+//#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
+#  if THRUST_CPP_DIALECT >= 2011
+#    define THRUST_INLINE_CONSTANT                 static const __device__
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
+#  else
+#    define THRUST_INLINE_CONSTANT                 static const __device__
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static const
+#  endif
+#else
+// FIXME: Add this when NVCC supports inline variables.
+//#  if   THRUST_CPP_DIALECT >= 2017
+//#    define THRUST_INLINE_CONSTANT                 inline constexpr
+//#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT inline constexpr
+#  if THRUST_CPP_DIALECT >= 2011
+#    define THRUST_INLINE_CONSTANT                 static constexpr
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static constexpr
+#  else
+#    define THRUST_INLINE_CONSTANT                 static const
+#    define THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT static const
+#  endif
+#endif
+
+// These definitions were intended for internal use only and are now obsolete.
+// If you relied on them, consider porting your code to use the functionality
+// in libcu++'s <nv/target> header.
+// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
+// them available again. These should be considered deprecated and will be
+// fully removed in a future version.
+#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
+  #ifndef THRUST_IS_DEVICE_CODE
+    #if defined(_NVHPC_CUDA)
+      #define THRUST_IS_DEVICE_CODE __builtin_is_device_code()
+      #define THRUST_IS_HOST_CODE (!__builtin_is_device_code())
+      #define THRUST_INCLUDE_DEVICE_CODE 1
+      #define THRUST_INCLUDE_HOST_CODE 1
+    #elif defined(__CUDA_ARCH__)
+      #define THRUST_IS_DEVICE_CODE 1
+      #define THRUST_IS_HOST_CODE 0
+      #define THRUST_INCLUDE_DEVICE_CODE 1
+      #define THRUST_INCLUDE_HOST_CODE 0
+    #else
+      #define THRUST_IS_DEVICE_CODE 0
+      #define THRUST_IS_HOST_CODE 1
+      #define THRUST_INCLUDE_DEVICE_CODE 0
+      #define THRUST_INCLUDE_HOST_CODE 1
+    #endif
+  #endif
+#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
diff --git a/thrust/detail/config/cpp_dialect.h b/thrust/detail/config/cpp_dialect.h
new file mode 100644
index 000000000..46b0caec7
--- /dev/null
+++ b/thrust/detail/config/cpp_dialect.h
@@ -0,0 +1,140 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cpp_dialect.h
+ *  \brief Detect the version of the C++ standard used by the compiler.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+
+// Deprecation warnings may be silenced by defining the following macros. These
+// may be combined.
+// - THRUST_IGNORE_DEPRECATED_CPP_DIALECT:
+//   Ignore all deprecated C++ dialects and outdated compilers.
+// - THRUST_IGNORE_DEPRECATED_CPP_11:
+//   Ignore deprecation warnings when compiling with C++11. C++03 and outdated
+//   compilers will still issue warnings.
+// - THRUST_IGNORE_DEPRECATED_COMPILER
+//   Ignore deprecation warnings when using deprecated compilers. Compiling
+//   with C++03 and C++11 will still issue warnings.
+
+// Check for the CUB opt-outs as well:
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_DIALECT) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_DIALECT)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_CPP_11) && \
+     defined(CUB_IGNORE_DEPRECATED_CPP_11)
+#  define    THRUST_IGNORE_DEPRECATED_CPP_11
+#endif
+#if !defined(THRUST_IGNORE_DEPRECATED_COMPILER) && \
+     defined(CUB_IGNORE_DEPRECATED_COMPILER)
+#  define    THRUST_IGNORE_DEPRECATED_COMPILER
+#endif
+
+#ifdef THRUST_IGNORE_DEPRECATED_CPP_DIALECT
+#  define THRUST_IGNORE_DEPRECATED_CPP_11
+#  define THRUST_IGNORE_DEPRECATED_COMPILER
+#endif
+
+// Define this to override the built-in detection.
+#ifndef THRUST_CPP_DIALECT
+
+// MSVC does not define __cplusplus correctly. _MSVC_LANG is used instead.
+// This macro is only defined in MSVC 2015U3+.
+#  ifdef _MSVC_LANG // Do not replace with THRUST_HOST_COMPILER test (see above)
+// MSVC2015 reports C++14 but lacks extended constexpr support. Treat as C++11.
+#    if THRUST_MSVC_VERSION < 1910 && _MSVC_LANG > 201103L /* MSVC < 2017 && CPP > 2011 */
+#      define THRUST_CPLUSPLUS 201103L /* Fix to 2011 */
+#    else
+#      define THRUST_CPLUSPLUS _MSVC_LANG /* We'll trust this for now. */
+#    endif // MSVC 2015 C++14 fix
+#  else
+#    define THRUST_CPLUSPLUS __cplusplus
+#  endif
+
+// Detect current dialect:
+#  if THRUST_CPLUSPLUS < 201103L
+#    define THRUST_CPP_DIALECT 2003
+#  elif THRUST_CPLUSPLUS < 201402L
+#    define THRUST_CPP_DIALECT 2011
+#  elif THRUST_CPLUSPLUS < 201703L
+#    define THRUST_CPP_DIALECT 2014
+#  elif THRUST_CPLUSPLUS == 201703L
+#    define THRUST_CPP_DIALECT 2017
+#  elif THRUST_CPLUSPLUS > 201703L // unknown, but is higher than 2017.
+#    define THRUST_CPP_DIALECT 2020
+#  endif
+
+#  undef THRUST_CPLUSPLUS // cleanup
+
+#endif // !THRUST_CPP_DIALECT
+
+// Define THRUST_COMPILER_DEPRECATION macro:
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_COMP_DEPR_IMPL(msg) \
+    __pragma(message(__FILE__ ":" THRUST_COMP_DEPR_IMPL0(__LINE__) ": warning: " #msg))
+#  define THRUST_COMP_DEPR_IMPL0(x) THRUST_COMP_DEPR_IMPL1(x)
+#  define THRUST_COMP_DEPR_IMPL1(x) #x
+#else // clang / gcc:
+#  define THRUST_COMP_DEPR_IMPL(msg) THRUST_COMP_DEPR_IMPL0(GCC warning #msg)
+#  define THRUST_COMP_DEPR_IMPL0(expr) _Pragma(#expr)
+#  define THRUST_COMP_DEPR_IMPL1 /* intentionally blank */
+#endif
+
+#define THRUST_COMPILER_DEPRECATION(REQ) \
+  THRUST_COMP_DEPR_IMPL(Thrust requires at least REQ. Define THRUST_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+#define THRUST_COMPILER_DEPRECATION_SOFT(REQ, CUR) \
+  THRUST_COMP_DEPR_IMPL(Thrust requires at least REQ. CUR is deprecated but still supported. CUR support will be removed in a future release. Define THRUST_IGNORE_DEPRECATED_CPP_DIALECT to suppress this message.)
+
+#ifndef THRUST_IGNORE_DEPRECATED_COMPILER
+
+// Compiler checks:
+#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && THRUST_GCC_VERSION < 50000
+     THRUST_COMPILER_DEPRECATION(GCC 5.0);
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG && THRUST_CLANG_VERSION < 70000
+     THRUST_COMPILER_DEPRECATION(Clang 7.0);
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && THRUST_MSVC_VERSION < 1910
+     // <2017. Hard upgrade message:
+     THRUST_COMPILER_DEPRECATION(MSVC 2019 (19.20/16.0/14.20));
+#  elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && THRUST_MSVC_VERSION < 1920
+     // >=2017, <2019. Soft deprecation message:
+     THRUST_COMPILER_DEPRECATION_SOFT(MSVC 2019 (19.20/16.0/14.20), MSVC 2017);
+#  endif
+
+#endif // THRUST_IGNORE_DEPRECATED_COMPILER
+
+#ifndef THRUST_IGNORE_DEPRECATED_DIALECT
+
+// Dialect checks:
+#  if THRUST_CPP_DIALECT < 2011
+     // <C++11. Hard upgrade message:
+     THRUST_COMPILER_DEPRECATION(C++14);
+#  elif THRUST_CPP_DIALECT == 2011 && !defined(THRUST_IGNORE_DEPRECATED_CPP_11)
+     // =C++11. Soft upgrade message:
+     THRUST_COMPILER_DEPRECATION_SOFT(C++14, C++11);
+#  endif
+
+#endif // THRUST_IGNORE_DEPRECATED_DIALECT
+
+#undef THRUST_COMPILER_DEPRECATION_SOFT
+#undef THRUST_COMPILER_DEPRECATION
+#undef THRUST_COMP_DEPR_IMPL
+#undef THRUST_COMP_DEPR_IMPL0
+#undef THRUST_COMP_DEPR_IMPL1
diff --git a/thrust/detail/config/deprecated.h b/thrust/detail/config/deprecated.h
new file mode 100644
index 000000000..05851c676
--- /dev/null
+++ b/thrust/detail/config/deprecated.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file deprecated.h
+ *  \brief Defines the THRUST_DEPRECATED macro
+ */
+
+#pragma once
+
+#include <thrust/detail/config/compiler.h>
+#include <thrust/detail/config/cpp_dialect.h>
+
+#if defined(CUB_IGNORE_DEPRECATED_API) && !defined(THRUST_IGNORE_DEPRECATED_API)
+#  define THRUST_IGNORE_DEPRECATED_API
+#endif
+
+#ifdef THRUST_IGNORE_DEPRECATED_API
+#  define THRUST_DEPRECATED
+#elif THRUST_CPP_DIALECT >= 2014
+#  define THRUST_DEPRECATED [[deprecated]]
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#  define THRUST_DEPRECATED __declspec(deprecated)
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
+#  define THRUST_DEPRECATED __attribute__((deprecated))
+#else
+#  define THRUST_DEPRECATED
+#endif
diff --git a/thrust/detail/config/device_system.h b/thrust/detail/config/device_system.h
index c4106d3fb..29418c903 100644
--- a/thrust/detail/config/device_system.h
+++ b/thrust/detail/config/device_system.h
@@ -26,25 +26,8 @@
 #define THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_CUDA
 #endif // THRUST_DEVICE_SYSTEM
 
-// XXX make the use of THRUST_DEVICE_BACKEND an error in Thrust 1.7
-// XXX eliminate the following in Thrust 1.7
-
-#define THRUST_DEVICE_BACKEND_CUDA THRUST_DEVICE_SYSTEM_CUDA
-#define THRUST_DEVICE_BACKEND_OMP  THRUST_DEVICE_SYSTEM_OMP
-#define THRUST_DEVICE_BACKEND_TBB  THRUST_DEVICE_SYSTEM_TBB
-
 #ifdef THRUST_DEVICE_BACKEND
-#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-#    pragma message("----------------------------------------------------------------------------------")
-#    pragma message("| WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |")
-#    pragma message("----------------------------------------------------------------------------------")
-#  else
-#    warning ----------------------------------------------------------------------------------
-#    warning | WARNING: THRUST_DEVICE_BACKEND is deprecated; use THRUST_DEVICE_SYSTEM instead |
-#    warning ----------------------------------------------------------------------------------
-#  endif // THRUST_HOST_COMPILER
-#  undef THRUST_DEVICE_SYSTEM
-#  define THRUST_DEVICE_SYSTEM THRUST_DEVICE_BACKEND
+#  error THRUST_DEVICE_BACKEND is no longer supported; use THRUST_DEVICE_SYSTEM instead.
 #endif // THRUST_DEVICE_BACKEND
 
 #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
diff --git a/thrust/detail/config/exec_check_disable.h b/thrust/detail/config/exec_check_disable.h
index acf2d0a45..9b25b375d 100644
--- a/thrust/detail/config/exec_check_disable.h
+++ b/thrust/detail/config/exec_check_disable.h
@@ -22,12 +22,18 @@
 
 #include <thrust/detail/config.h>
 
-#if defined(__CUDACC__)
-#  if __CUDAVER__ >= 75000
-#    define __thrust_exec_check_disable__ #pragma nv_exec_check_disable
-#  else
-#    define __thrust_exec_check_disable__ #pragma hd_warning_disable
-#  endif /* __CUDAVER__ */
+// #pragma nv_exec_check_disable is only recognized by NVCC.  Having a macro
+// expand to a #pragma (rather than _Pragma) only works with NVCC's compilation
+// model, not with other compilers.
+#if defined(__CUDACC__) && !defined(_NVHPC_CUDA) && \
+    !(defined(__CUDA__) && defined(__clang__))
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+#define __thrust_exec_check_disable__ __pragma("nv_exec_check_disable")
+#else // MSVC
+#define __thrust_exec_check_disable__ _Pragma("nv_exec_check_disable")
+#endif // MSVC
+
 #else
 
 #define __thrust_exec_check_disable__
diff --git a/thrust/detail/config/forceinline.h b/thrust/detail/config/forceinline.h
index 664130425..b001fd4b1 100644
--- a/thrust/detail/config/forceinline.h
+++ b/thrust/detail/config/forceinline.h
@@ -22,7 +22,7 @@
 
 #include <thrust/detail/config.h>
 
-#if defined(__CUDACC__)
+#if defined(__CUDACC__) || defined(_NVHPC_CUDA)
 
 #define __thrust_forceinline__ __forceinline__
 
diff --git a/thrust/detail/config/global_workarounds.h b/thrust/detail/config/global_workarounds.h
index a9015e846..9800f0359 100644
--- a/thrust/detail/config/global_workarounds.h
+++ b/thrust/detail/config/global_workarounds.h
@@ -20,7 +20,7 @@
 
 // XXX workaround gcc 4.8+'s complaints about unused local typedefs by silencing them globally
 #if defined(THRUST_GCC_VERSION) && (THRUST_GCC_VERSION >= 40800)
-#  if defined(__NVCC__) && (CUDA_VERSION >= 6000)
+#  if defined(__NVCC__) && (CUDART_VERSION >= 6000)
 #    pragma GCC diagnostic ignored "-Wunused-local-typedefs"
 #  endif // nvcc & cuda 6+
 #endif // gcc 4.8
diff --git a/thrust/detail/config/host_system.h b/thrust/detail/config/host_system.h
index 5c1387803..f216f6492 100644
--- a/thrust/detail/config/host_system.h
+++ b/thrust/detail/config/host_system.h
@@ -25,25 +25,8 @@
 #define THRUST_HOST_SYSTEM THRUST_HOST_SYSTEM_CPP
 #endif // THRUST_HOST_SYSTEM
 
-// XXX make the use of THRUST_HOST_BACKEND an error in Thrust 1.7
-// XXX eliminate the following in Thrust 1.7
-
-#define THRUST_HOST_BACKEND_CPP THRUST_HOST_SYSTEM_CPP
-#define THRUST_HOST_BACKEND_OMP THRUST_HOST_SYSTEM_OMP
-#define THRUST_HOST_BACKEND_TBB THRUST_HOST_SYSTEM_TBB
-
 #ifdef THRUST_HOST_BACKEND
-#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-#    pragma message("------------------------------------------------------------------------------")
-#    pragma message("| WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |")
-#    pragma message("------------------------------------------------------------------------------")
-#  else
-#    warning ------------------------------------------------------------------------------
-#    warning | WARNING: THRUST_HOST_BACKEND is deprecated; use THRUST_HOST_SYSTEM instead |
-#    warning ------------------------------------------------------------------------------
-#  endif // THRUST_HOST_COMPILER
-#  undef THRUST_HOST_SYSTEM
-#  define THRUST_HOST_SYSTEM THRUST_HOST_BACKEND
+#  error THRUST_HOST_BACKEND is no longer supported; use THRUST_HOST_SYSTEM instead.
 #endif // THRUST_HOST_BACKEND
 
 #if THRUST_HOST_SYSTEM == THRUST_HOST_SYSTEM_CPP
diff --git a/thrust/detail/config/memory_resource.h b/thrust/detail/config/memory_resource.h
new file mode 100644
index 000000000..ab719c9bd
--- /dev/null
+++ b/thrust/detail/config/memory_resource.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/config/cpp_compatibility.h>
+
+#define THRUST_MR_DEFAULT_ALIGNMENT THRUST_ALIGNOF(THRUST_NS_QUALIFIER::detail::max_align_t)
+
+#if THRUST_CPP_DIALECT >= 2017
+#  if __has_include(<memory_resource>)
+#    define THRUST_MR_STD_MR_HEADER <memory_resource>
+#    define THRUST_MR_STD_MR_NS std::pmr
+#  elif __has_include(<experimental/memory_resource>)
+#    define THRUST_MR_STD_MR_HEADER <experimental/memory_resource>
+#    define THRUST_MR_STD_MR_NS std::experimental::pmr
+#  endif
+#endif
diff --git a/thrust/detail/config/namespace.h b/thrust/detail/config/namespace.h
new file mode 100644
index 000000000..9c7904616
--- /dev/null
+++ b/thrust/detail/config/namespace.h
@@ -0,0 +1,120 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+/**
+ * \file namespace.h
+ * \brief Utilities that allow `thrust::` to be placed inside an
+ * application-specific namespace.
+ */
+
+/**
+ * \def THRUST_CUB_WRAPPED_NAMESPACE
+ * If defined, this value will be used as the name of a namespace that wraps the
+ * `thrust::` and `cub::` namespaces.
+ * This macro should not be used with any other Thrust namespace macros.
+ */
+#ifdef THRUST_CUB_WRAPPED_NAMESPACE
+#define THRUST_WRAPPED_NAMESPACE THRUST_CUB_WRAPPED_NAMESPACE
+#endif
+
+/**
+ * \def THRUST_WRAPPED_NAMESPACE
+ * If defined, this value will be used as the name of a namespace that wraps the
+ * `thrust::` namespace.
+ * If THRUST_CUB_WRAPPED_NAMESPACE is set, this will inherit that macro's value.
+ * This macro should not be used with any other Thrust namespace macros.
+ */
+#ifdef THRUST_WRAPPED_NAMESPACE
+#define THRUST_NS_PREFIX                                                       \
+  namespace THRUST_WRAPPED_NAMESPACE                                           \
+  {
+
+#define THRUST_NS_POSTFIX }
+
+#define THRUST_NS_QUALIFIER ::THRUST_WRAPPED_NAMESPACE::thrust
+#endif
+
+/**
+ * \def THRUST_NS_PREFIX
+ * This macro is inserted prior to all `namespace thrust { ... }` blocks. It is
+ * derived from THRUST_WRAPPED_NAMESPACE, if set, and will be empty otherwise.
+ * It may be defined by users, in which case THRUST_NS_PREFIX,
+ * THRUST_NS_POSTFIX, and THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_PREFIX
+#define THRUST_NS_PREFIX
+#endif
+
+/**
+ * \def THRUST_NS_POSTFIX
+ * This macro is inserted following the closing braces of all
+ * `namespace thrust { ... }` block. It is defined appropriately when
+ * THRUST_WRAPPED_NAMESPACE is set, and will be empty otherwise. It may be
+ * defined by users, in which case THRUST_NS_PREFIX, THRUST_NS_POSTFIX, and
+ * THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_POSTFIX
+#define THRUST_NS_POSTFIX
+#endif
+
+/**
+ * \def THRUST_NS_QUALIFIER
+ * This macro is used to qualify members of thrust:: when accessing them from
+ * outside of their namespace. By default, this is just `::thrust`, and will be
+ * set appropriately when THRUST_WRAPPED_NAMESPACE is defined. This macro may be
+ * defined by users, in which case THRUST_NS_PREFIX, THRUST_NS_POSTFIX, and
+ * THRUST_NS_QUALIFIER must all be set consistently.
+ */
+#ifndef THRUST_NS_QUALIFIER
+#define THRUST_NS_QUALIFIER ::thrust
+#endif
+
+/**
+ * \def THRUST_NAMESPACE_BEGIN
+ * This macro is used to open a `thrust::` namespace block, along with any
+ * enclosing namespaces requested by THRUST_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by Thrust and may not be overridden.
+ */
+#define THRUST_NAMESPACE_BEGIN                                                 \
+  THRUST_NS_PREFIX                                                             \
+  namespace thrust                                                             \
+  {
+
+/**
+ * \def THRUST_NAMESPACE_END
+ * This macro is used to close a `thrust::` namespace block, along with any
+ * enclosing namespaces requested by THRUST_WRAPPED_NAMESPACE, etc.
+ * This macro is defined by Thrust and may not be overridden.
+ */
+#define THRUST_NAMESPACE_END                                                   \
+  } /* end namespace thrust */                                                 \
+  THRUST_NS_POSTFIX
+
+// The following is just here to add docs for the thrust namespace:
+
+THRUST_NS_PREFIX
+
+/*! \namespace thrust
+ *  \brief \p thrust is the top-level namespace which contains all Thrust
+ *         functions and types.
+ */
+namespace thrust
+{
+}
+
+THRUST_NS_POSTFIX
diff --git a/thrust/detail/config/simple_defines.h b/thrust/detail/config/simple_defines.h
index 369fa6da5..e3ea2eb64 100644
--- a/thrust/detail/config/simple_defines.h
+++ b/thrust/detail/config/simple_defines.h
@@ -24,5 +24,7 @@
 #define THRUST_FALSE   0
 #define THRUST_TRUE    1
 
+#define THRUST_UNUSED_VAR(expr) do { (void)(expr); } while (0)
+
 #define THRUST_PREVENT_MACRO_SUBSTITUTION
 
diff --git a/thrust/detail/contiguous_storage.h b/thrust/detail/contiguous_storage.h
index 80b7906c8..536c1c27c 100644
--- a/thrust/detail/contiguous_storage.h
+++ b/thrust/detail/contiguous_storage.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -19,13 +19,15 @@
 #include <thrust/iterator/detail/normal_iterator.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
+struct copy_allocator_t {};
+
 // XXX parameter T is redundant with parameter Alloc
 template<typename T, typename Alloc>
   class contiguous_storage
@@ -40,14 +42,8 @@ template<typename T, typename Alloc>
     typedef typename alloc_traits::const_pointer       const_pointer;
     typedef typename alloc_traits::size_type           size_type;
     typedef typename alloc_traits::difference_type     difference_type;
-
-    // XXX we should bring reference & const_reference into allocator_traits
-    //     at the moment, it's unclear how -- we have nothing analogous to
-    //     rebind_pointer for references
-    //     we either need to add reference_traits or extend the existing
-    //     pointer_traits to support wrapped references
-    typedef typename Alloc::reference                  reference;
-    typedef typename Alloc::const_reference            const_reference;
+    typedef typename alloc_traits::reference           reference;
+    typedef typename alloc_traits::const_reference     const_reference;
 
     typedef thrust::detail::normal_iterator<pointer>       iterator;
     typedef thrust::detail::normal_iterator<const_pointer> const_iterator;
@@ -62,25 +58,39 @@ template<typename T, typename Alloc>
 
     __thrust_exec_check_disable__
     __host__ __device__
-    ~contiguous_storage(void);
+    explicit contiguous_storage(copy_allocator_t, const contiguous_storage &other);
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    explicit contiguous_storage(copy_allocator_t, const contiguous_storage &other, size_type n);
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    ~contiguous_storage();
+
+    __host__ __device__
+    size_type size() const;
 
     __host__ __device__
-    size_type size(void) const;
+    size_type max_size() const;
 
     __host__ __device__
-    size_type max_size(void) const;
+    pointer data();
 
     __host__ __device__
-    iterator begin(void);
-    
+    const_pointer data() const;
+
+    __host__ __device__
+    iterator begin();
+
     __host__ __device__
-    const_iterator begin(void) const;
+    const_iterator begin() const;
 
     __host__ __device__
-    iterator end(void);
+    iterator end();
 
     __host__ __device__
-    const_iterator end(void) const;
+    const_iterator end() const;
 
     __host__ __device__
     reference operator[](size_type n);
@@ -89,14 +99,14 @@ template<typename T, typename Alloc>
     const_reference operator[](size_type n) const;
 
     __host__ __device__
-    allocator_type get_allocator(void) const;
+    allocator_type get_allocator() const;
 
     // note that allocate does *not* automatically call deallocate
     __host__ __device__
     void allocate(size_type n);
 
     __host__ __device__
-    void deallocate(void);
+    void deallocate();
 
     __host__ __device__
     void swap(contiguous_storage &x);
@@ -132,16 +142,85 @@ template<typename T, typename Alloc>
     __host__ __device__
     void destroy(iterator first, iterator last);
 
+    __host__ __device__
+    void deallocate_on_allocator_mismatch(const contiguous_storage &other);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch(const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void set_allocator(const allocator_type &alloc);
+
+    __host__ __device__
+    bool is_allocator_not_equal(const allocator_type &alloc) const;
+
+    __host__ __device__
+    bool is_allocator_not_equal(const contiguous_storage &other) const;
+
+    __host__ __device__
+    void propagate_allocator(const contiguous_storage &other);
+
+#if THRUST_CPP_DIALECT >= 2011
+    __host__ __device__
+    void propagate_allocator(contiguous_storage &other);
+
+    // allow move assignment for a sane implementation of allocator propagation
+    // on move assignment
+    __host__ __device__
+    contiguous_storage &operator=(contiguous_storage &&other);
+#endif
+
   private:
     // XXX we could inherit from this to take advantage of empty base class optimization
     allocator_type m_allocator;
 
     iterator m_begin;
-    
+
     size_type m_size;
 
     // disallow assignment
     contiguous_storage &operator=(const contiguous_storage &x);
+
+    __host__ __device__
+    void swap_allocators(true_type, const allocator_type &);
+
+    __host__ __device__
+    void swap_allocators(false_type, allocator_type &);
+
+    __host__ __device__
+    bool is_allocator_not_equal_dispatch(true_type, const allocator_type &) const;
+
+    __host__ __device__
+    bool is_allocator_not_equal_dispatch(false_type, const allocator_type &) const;
+
+    __host__ __device__
+    void deallocate_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void deallocate_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void destroy_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &other,
+        iterator first, iterator last);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(true_type, const contiguous_storage &other);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(false_type, const contiguous_storage &other);
+
+#if THRUST_CPP_DIALECT >= 2011
+    __host__ __device__
+    void propagate_allocator_dispatch(true_type, contiguous_storage &other);
+
+    __host__ __device__
+    void propagate_allocator_dispatch(false_type, contiguous_storage &other);
+#endif
 }; // end contiguous_storage
 
 } // end detail
@@ -150,7 +229,7 @@ template<typename T, typename Alloc>
 __host__ __device__
 void swap(detail::contiguous_storage<T,Alloc> &lhs, detail::contiguous_storage<T,Alloc> &rhs);
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/contiguous_storage.inl>
 
diff --git a/thrust/detail/contiguous_storage.inl b/thrust/detail/contiguous_storage.inl
index b5453e431..7ae8657f0 100644
--- a/thrust/detail/contiguous_storage.inl
+++ b/thrust/detail/contiguous_storage.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/contiguous_storage.h>
 #include <thrust/detail/swap.h>
 #include <thrust/detail/allocator/allocator_traits.h>
@@ -23,14 +24,26 @@
 #include <thrust/detail/allocator/default_construct_range.h>
 #include <thrust/detail/allocator/destroy_range.h>
 #include <thrust/detail/allocator/fill_construct_range.h>
+
+#include <nv/target>
+
+#include <stdexcept> // for std::runtime_error
 #include <utility> // for use of std::swap in the WAR below
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
+class allocator_mismatch_on_swap : public std::runtime_error
+{
+public:
+  allocator_mismatch_on_swap()
+    :std::runtime_error("swap called on containers with allocators that propagate on swap, but compare non-equal")
+  {
+  }
+};
+
 __thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
@@ -55,11 +68,34 @@ __host__ __device__
   allocate(n);
 } // end contiguous_storage::contiguous_storage()
 
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(copy_allocator_t,
+        const contiguous_storage &other)
+      :m_allocator(other.m_allocator),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+} // end contiguous_storage::contiguous_storage()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc>
+    ::contiguous_storage(copy_allocator_t,
+        const contiguous_storage &other, size_type n)
+      :m_allocator(other.m_allocator),
+       m_begin(pointer(static_cast<T*>(0))),
+       m_size(0)
+{
+  allocate(n);
+} // end contiguous_storage::contiguous_storage()
+
 __thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   contiguous_storage<T,Alloc>
-    ::~contiguous_storage(void)
+    ::~contiguous_storage()
 {
   deallocate();
 } // end contiguous_storage::~contiguous_storage()
@@ -68,7 +104,7 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::size_type
     contiguous_storage<T,Alloc>
-      ::size(void) const
+      ::size() const
 {
   return m_size;
 } // end contiguous_storage::size()
@@ -77,7 +113,7 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::size_type
     contiguous_storage<T,Alloc>
-      ::max_size(void) const
+      ::max_size() const
 {
   return alloc_traits::max_size(m_allocator);
 } // end contiguous_storage::max_size()
@@ -86,7 +122,7 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::iterator
     contiguous_storage<T,Alloc>
-      ::begin(void)
+      ::begin()
 {
   return m_begin;
 } // end contiguous_storage::begin()
@@ -95,7 +131,7 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::const_iterator
     contiguous_storage<T,Alloc>
-      ::begin(void) const
+      ::begin() const
 {
   return m_begin;
 } // end contiguous_storage::begin()
@@ -104,7 +140,7 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::iterator
     contiguous_storage<T,Alloc>
-      ::end(void)
+      ::end()
 {
   return m_begin + size();
 } // end contiguous_storage::end()
@@ -113,11 +149,29 @@ template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::const_iterator
     contiguous_storage<T,Alloc>
-      ::end(void) const
+      ::end() const
 {
   return m_begin + size();
 } // end contiguous_storage::end()
 
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::pointer
+    contiguous_storage<T,Alloc>
+      ::data()
+{
+  return &*m_begin;
+} // end contiguous_storage::data()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  typename contiguous_storage<T,Alloc>::const_pointer
+    contiguous_storage<T,Alloc>
+      ::data() const
+{
+  return &*m_begin;
+} // end contiguous_storage::data()
+
 template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::reference
@@ -136,11 +190,12 @@ __host__ __device__
   return m_begin[n];
 } // end contiguous_storage::operator[]()
 
+__thrust_exec_check_disable__
 template<typename T, typename Alloc>
 __host__ __device__
   typename contiguous_storage<T,Alloc>::allocator_type
     contiguous_storage<T,Alloc>
-      ::get_allocator(void) const
+      ::get_allocator() const
 {
   return m_allocator;
 } // end contiguous_storage::get_allocator()
@@ -165,7 +220,7 @@ __host__ __device__
 template<typename T, typename Alloc>
 __host__ __device__
   void contiguous_storage<T,Alloc>
-    ::deallocate(void)
+    ::deallocate()
 {
   if(size() > 0)
   {
@@ -183,6 +238,13 @@ __host__ __device__
   thrust::swap(m_begin, x.m_begin);
   thrust::swap(m_size, x.m_size);
 
+  swap_allocators(
+    integral_constant<
+      bool,
+      allocator_traits<Alloc>::propagate_on_container_swap::value
+    >(),
+    x.m_allocator);
+
   thrust::swap(m_allocator, x.m_allocator);
 } // end contiguous_storage::swap()
 
@@ -256,6 +318,226 @@ __host__ __device__
   destroy_range(m_allocator, first.base(), last - first);
 } // end contiguous_storage::destroy()
 
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch(const contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  deallocate_on_allocator_mismatch_dispatch(c, other);
+} // end contiguous_storage::deallocate_on_allocator_mismatch
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch(const contiguous_storage &other,
+        iterator first, iterator last)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  destroy_on_allocator_mismatch_dispatch(c, other, first, last);
+} // end contiguous_storage::destroy_on_allocator_mismatch
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::set_allocator(const Alloc &alloc)
+{
+  m_allocator = alloc;
+} // end contiguous_storage::set_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal(const Alloc &alloc) const
+{
+  return is_allocator_not_equal_dispatch(
+    integral_constant<
+      bool,
+      allocator_traits<Alloc>::is_always_equal::value
+    >(),
+    alloc);
+} // end contiguous_storage::is_allocator_not_equal()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal(const contiguous_storage<T,Alloc> &other) const
+{
+  return is_allocator_not_equal(m_allocator, other.m_allocator);
+} // end contiguous_storage::is_allocator_not_equal()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator(const contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_copy_assignment::value
+  > c;
+
+  propagate_allocator_dispatch(c, other);
+} // end contiguous_storage::propagate_allocator()
+
+#if THRUST_CPP_DIALECT >= 2011
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator(contiguous_storage &other)
+{
+  integral_constant<
+    bool,
+    allocator_traits<Alloc>::propagate_on_container_move_assignment::value
+  > c;
+
+  propagate_allocator_dispatch(c, other);
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  contiguous_storage<T,Alloc> &contiguous_storage<T,Alloc>
+    ::operator=(contiguous_storage &&other)
+{
+  if (size() > 0)
+  {
+    deallocate();
+  }
+  propagate_allocator(other);
+  m_begin = std::move(other.m_begin);
+  m_size = std::move(other.m_size);
+
+  other.m_begin = pointer(static_cast<T*>(0));
+  other.m_size = 0;
+
+  return *this;
+} // end contiguous_storage::propagate_allocator()
+#endif
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::swap_allocators(true_type, const Alloc &)
+{
+} // end contiguous_storage::swap_allocators()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::swap_allocators(false_type, Alloc &other)
+{
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // allocators must be equal when swapping containers with allocators that propagate on swap
+    assert(!is_allocator_not_equal(other));
+  ), (
+    if (is_allocator_not_equal(other))
+    {
+      throw allocator_mismatch_on_swap();
+    }
+  ));
+
+  thrust::swap(m_allocator, other);
+} // end contiguous_storage::swap_allocators()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal_dispatch(true_type /*is_always_equal*/, const Alloc &) const
+{
+  return false;
+} // end contiguous_storage::is_allocator_not_equal_dispatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  bool contiguous_storage<T,Alloc>
+    ::is_allocator_not_equal_dispatch(false_type /*!is_always_equal*/, const Alloc& other) const
+{
+  return m_allocator != other;
+} // end contiguous_storage::is_allocator_not_equal_dispatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other)
+{
+  if (m_allocator != other.m_allocator)
+  {
+    deallocate();
+  }
+} // end contiguous_storage::deallocate_on_allocator_mismatch()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::deallocate_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &)
+{
+} // end contiguous_storage::deallocate_on_allocator_mismatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch_dispatch(true_type, const contiguous_storage &other,
+        iterator first, iterator last)
+{
+  if (m_allocator != other.m_allocator)
+  {
+    destroy(first, last);
+  }
+} // end contiguous_storage::destroy_on_allocator_mismatch()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::destroy_on_allocator_mismatch_dispatch(false_type, const contiguous_storage &,
+        iterator, iterator)
+{
+} // end contiguous_storage::destroy_on_allocator_mismatch()
+
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(true_type, const contiguous_storage &other)
+{
+  m_allocator = other.m_allocator;
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(false_type, const contiguous_storage &)
+{
+} // end contiguous_storage::propagate_allocator()
+
+#if THRUST_CPP_DIALECT >= 2011
+__thrust_exec_check_disable__
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(true_type, contiguous_storage &other)
+{
+  m_allocator = std::move(other.m_allocator);
+} // end contiguous_storage::propagate_allocator()
+
+template<typename T, typename Alloc>
+__host__ __device__
+  void contiguous_storage<T,Alloc>
+    ::propagate_allocator_dispatch(false_type, contiguous_storage &)
+{
+} // end contiguous_storage::propagate_allocator()
+#endif
+
 } // end detail
 
 template<typename T, typename Alloc>
@@ -265,5 +547,4 @@ __host__ __device__
   lhs.swap(rhs);
 } // end swap()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/copy.h b/thrust/detail/copy.h
index 5e9feb0f9..d6c5bc805 100644
--- a/thrust/detail/copy.h
+++ b/thrust/detail/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename System,
          typename InputIterator,
@@ -85,7 +84,7 @@ __host__ __device__
 
 
 } // end detail
-} // end thrust
 
-#include <thrust/detail/copy.inl>
+THRUST_NAMESPACE_END
 
+#include <thrust/detail/copy.inl>
diff --git a/thrust/detail/copy.inl b/thrust/detail/copy.inl
index 85701fde7..4d62798c7 100644
--- a/thrust/detail/copy.inl
+++ b/thrust/detail/copy.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/detail/copy.h>
@@ -21,9 +22,7 @@
 #include <thrust/system/detail/generic/copy.h>
 #include <thrust/system/detail/adl/copy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename OutputIterator>
@@ -127,6 +126,4 @@ template<typename InputIterator,
   return thrust::detail::two_system_copy_n(system1, system2, first, n, result);
 } // end copy_n()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/copy_if.h b/thrust/detail/copy_if.h
index 563623c88..32eb5e083 100644
--- a/thrust/detail/copy_if.h
+++ b/thrust/detail/copy_if.h
@@ -19,9 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -68,8 +66,6 @@ template<typename InputIterator1,
                          OutputIterator result,
                          Predicate pred);
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/copy_if.inl>
-
diff --git a/thrust/detail/copy_if.inl b/thrust/detail/copy_if.inl
index f4c22f8a5..952541c51 100644
--- a/thrust/detail/copy_if.inl
+++ b/thrust/detail/copy_if.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/copy_if.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,9 +23,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/adl/copy_if.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -104,6 +104,4 @@ template<typename InputIterator1,
   return thrust::copy_if(select_system(system1,system2,system3), first, last, stencil, result, pred);
 } // end copy_if()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/count.h b/thrust/detail/count.h
new file mode 100644
index 000000000..7c48bc546
--- /dev/null
+++ b/thrust/detail/count.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename EqualityComparable>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+          InputIterator first,
+          InputIterator last,
+          const EqualityComparable& value);
+
+template<typename DerivedPolicy,
+         typename InputIterator,
+         typename Predicate>
+__host__ __device__
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+             InputIterator first,
+             InputIterator last,
+             Predicate pred);
+
+template <typename InputIterator,
+          typename EqualityComparable>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count(InputIterator first,
+          InputIterator last,
+          const EqualityComparable& value);
+
+template <typename InputIterator,
+          typename Predicate>
+  typename thrust::iterator_traits<InputIterator>::difference_type
+    count_if(InputIterator first,
+             InputIterator last,
+             Predicate pred);
+
+THRUST_NAMESPACE_END
+
+#include <thrust/detail/count.inl>
diff --git a/thrust/detail/count.inl b/thrust/detail/count.inl
index f7ba7a54e..5d1f628a9 100644
--- a/thrust/detail/count.inl
+++ b/thrust/detail/count.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file count.inl
- *  \brief Inline file for count.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/count.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/count.h>
 #include <thrust/system/detail/adl/count.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename EqualityComparable>
@@ -79,6 +74,4 @@ count_if(InputIterator first, InputIterator last, Predicate pred)
   return thrust::count_if(select_system(system), first, last, pred);
 } // end count_if()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/cpp11_required.h b/thrust/detail/cpp11_required.h
new file mode 100644
index 000000000..a7fb4fb12
--- /dev/null
+++ b/thrust/detail/cpp11_required.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#ifndef THRUST_CPP11_REQUIRED_NO_ERROR
+#  if THRUST_CPP_DIALECT < 2011 
+#    error C++11 is required for this Thrust feature; please upgrade your compiler or pass the appropriate -std=c++XX flag to it.
+#  endif
+#endif
+
diff --git a/thrust/detail/cpp14_required.h b/thrust/detail/cpp14_required.h
new file mode 100644
index 000000000..083c8a1ad
--- /dev/null
+++ b/thrust/detail/cpp14_required.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config/cpp_dialect.h>
+
+#ifndef THRUST_CPP14_REQUIRED_NO_ERROR
+#  if THRUST_CPP_DIALECT < 2014
+#    error C++14 is required for this Thrust feature; please upgrade your compiler or pass the appropriate -std=c++14 flag to it.
+#  endif
+#endif
+
diff --git a/thrust/detail/cstdint.h b/thrust/detail/cstdint.h
index 248390a52..f41e11475 100644
--- a/thrust/detail/cstdint.h
+++ b/thrust/detail/cstdint.h
@@ -16,12 +16,16 @@
 
 #pragma once
 
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG)
+#include <thrust/detail/config.h>
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) || \
+    (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG) || \
+    (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_INTEL)
 #include <stdint.h>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -75,5 +79,5 @@ typedef divine_intptr_t<>::type   intptr_t;
 typedef divine_uintptr_t<>::type  uintptr_t;
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/dependencies_aware_execution_policy.h b/thrust/detail/dependencies_aware_execution_policy.h
new file mode 100644
index 000000000..a7567a3fa
--- /dev/null
+++ b/thrust/detail/dependencies_aware_execution_policy.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <tuple>
+
+#include <thrust/detail/execute_with_dependencies.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+template<template<typename> class ExecutionPolicyCRTPBase>
+struct dependencies_aware_execution_policy
+{
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(Dependencies&& ...dependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(dependencies))... };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(std::tuple<Dependencies...>& dependencies) const
+    {
+        return { capture_as_dependency(dependencies) };
+    }
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    after(std::tuple<Dependencies...>&& dependencies) const
+    {
+        return { capture_as_dependency(std::move(dependencies)) };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(Dependencies&& ...dependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(dependencies))... };
+    }
+
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(std::tuple<Dependencies...>& dependencies) const
+    {
+        return { capture_as_dependency(dependencies) };
+    }
+    template<typename ...Dependencies>
+    __host__
+    thrust::detail::execute_with_dependencies<
+        ExecutionPolicyCRTPBase,
+        Dependencies...
+    >
+    rebind_after(std::tuple<Dependencies...>&& dependencies) const
+    {
+        return { capture_as_dependency(std::move(dependencies)) };
+    }
+};
+
+} // end detail
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/detail/device_delete.inl b/thrust/detail/device_delete.inl
index f1a67f91b..87f73aad9 100644
--- a/thrust/detail/device_delete.inl
+++ b/thrust/detail/device_delete.inl
@@ -14,17 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_delete.inl
- *  \brief Inline file for device_delete.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/device_delete.h>
 #include <thrust/device_free.h>
 #include <thrust/detail/allocator/destroy_range.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -43,5 +41,4 @@ template<typename T>
   thrust::device_free(ptr);
 } // end device_delete()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_free.inl b/thrust/detail/device_free.inl
index 7a1b6c123..806802e16 100644
--- a/thrust/detail/device_free.inl
+++ b/thrust/detail/device_free.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_free.inl
- *  \brief Inline file for device_free.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/device_free.h>
@@ -25,8 +22,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 void device_free(thrust::device_ptr<void> ptr)
 {
@@ -40,5 +36,4 @@ void device_free(thrust::device_ptr<void> ptr)
   thrust::free(s, ptr);
 } // end device_free()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_malloc.inl b/thrust/detail/device_malloc.inl
index 938c3c807..f4222f51d 100644
--- a/thrust/detail/device_malloc.inl
+++ b/thrust/detail/device_malloc.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc.inl
- *  \brief Inline file for device_malloc.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/device_malloc.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 thrust::device_ptr<void> device_malloc(const std::size_t n)
 {
@@ -55,6 +50,4 @@ template<typename T>
   return thrust::device_ptr<T>(thrust::malloc<T>(s,n).get());
 } // end device_malloc()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_new.inl b/thrust/detail/device_new.inl
index 2551badb4..c66e2cbff 100644
--- a/thrust/detail/device_new.inl
+++ b/thrust/detail/device_new.inl
@@ -14,17 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_new.inl
- *  \brief Inline file for device_new.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/device_new.h>
 #include <thrust/device_malloc.h>
 #include <thrust/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename T>
   device_ptr<T> device_new(device_ptr<void> p,
@@ -45,7 +42,7 @@ template<typename T>
 
   // run copy constructors at p here
   thrust::uninitialized_fill(result, result + n, exemplar);
-  
+
   return result;
 } // end device_new()
 
@@ -56,5 +53,4 @@ template<typename T>
   return device_new<T>(thrust::device_malloc<T>(n));
 } // end device_new()
 
-} // thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_ptr.inl b/thrust/detail/device_ptr.inl
index 6171b2103..361c61f33 100644
--- a/thrust/detail/device_ptr.inl
+++ b/thrust/detail/device_ptr.inl
@@ -14,26 +14,25 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_ptr.inl
- *  \brief Inline file for device_ptr.h.
- */
+#pragma once
 
 #include <thrust/device_ptr.h>
 #include <thrust/device_reference.h>
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename T>
+  __host__ __device__
   device_ptr<T> device_pointer_cast(T *ptr)
 {
   return device_ptr<T>(ptr);
 } // end device_pointer_cast()
 
 template<typename T>
+  __host__ __device__
   device_ptr<T> device_pointer_cast(const device_ptr<T> &ptr)
 {
   return ptr;
@@ -61,5 +60,5 @@ template<typename T>
 
 
 } // end namespace detail
-} // end namespace thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_reference.inl b/thrust/detail/device_reference.inl
deleted file mode 100644
index 1f101f4ee..000000000
--- a/thrust/detail/device_reference.inl
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file device_reference.inl
- *  \brief Inline file for device_reference.h.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/device_reference.h>
-
-namespace thrust
-{
-
-template<typename T>
-  template<typename OtherT>
-    device_reference<T> &
-      device_reference<T>
-        ::operator=(const device_reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end operator=()
-
-template<typename T>
-  device_reference<T> &
-    device_reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end operator=()
-
-template<typename T>
-__host__ __device__
-void swap(device_reference<T> &a, device_reference<T> &b)
-{
-  a.swap(b);
-} // end swap()
-
-} // end thrust
-
diff --git a/thrust/detail/dispatch/is_trivial_copy.h b/thrust/detail/dispatch/is_trivial_copy.h
deleted file mode 100644
index 691b1df20..000000000
--- a/thrust/detail/dispatch/is_trivial_copy.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file trivial_copy.h
- *  \brief Device implementations for copying memory between host and device.
- */
-
-#pragma once
-
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/iterator/detail/is_trivial_iterator.h>
-
-namespace thrust
-{
-
-namespace detail
-{
-
-namespace dispatch
-{
-
-
-// a trivial copy's iterator's value_types match,
-// the iterators themselves are normal_iterators
-// and the ToIterator's value_type has_trivial_assign
-template<typename FromIterator, typename ToIterator>
-  struct is_trivial_copy :
-    integral_constant<
-      bool,
-      is_same<
-        typename thrust::iterator_value<FromIterator>::type,
-        typename thrust::iterator_value<ToIterator>::type
-      >::value
-      && is_trivial_iterator<FromIterator>::value
-      && is_trivial_iterator<ToIterator>::value
-      && has_trivial_assign<typename thrust::iterator_value<ToIterator>::type>::value
-    > {};
-
-} // end namespace dispatch
-
-} // end namespace detail
-
-} // end namespace thrust
-
diff --git a/thrust/detail/distance.inl b/thrust/detail/distance.inl
index 5732a9c25..6702c2b6f 100644
--- a/thrust/detail/distance.inl
+++ b/thrust/detail/distance.inl
@@ -14,19 +14,16 @@
  *  limitations under the License.
  */
 
-
-/*! \file distance.inl
- *  \brief Inline file for distance.h
- */
+#pragma once
 
 #include <thrust/advance.h>
+#include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
+__thrust_exec_check_disable__
 template<typename InputIterator>
 inline __host__ __device__
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -35,6 +32,4 @@ inline __host__ __device__
   return thrust::system::detail::generic::distance(first, last);
 } // end distance()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/equal.inl b/thrust/detail/equal.inl
index 08bfbab0b..e21ddfa5a 100644
--- a/thrust/detail/equal.inl
+++ b/thrust/detail/equal.inl
@@ -14,20 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file equal.inl
- *  \brief Inline file for equal.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/equal.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/generic/equal.h>
 #include <thrust/system/detail/adl/equal.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename System, typename InputIterator1, typename InputIterator2>
@@ -65,7 +61,7 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
 }
 
 
-template <typename InputIterator1, typename InputIterator2, 
+template <typename InputIterator1, typename InputIterator2,
           typename BinaryPredicate>
 bool equal(InputIterator1 first1, InputIterator1 last1,
            InputIterator2 first2, BinaryPredicate binary_pred)
@@ -81,6 +77,4 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
   return thrust::equal(select_system(system1,system2), first1, last1, first2, binary_pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/event_error.h b/thrust/detail/event_error.h
new file mode 100644
index 000000000..b928e0650
--- /dev/null
+++ b/thrust/detail/event_error.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/// \file thrust/detail/event_error.h
+/// \brief \c thrust::future and thrust::future error handling types and codes.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/error_code.h>
+
+#include <stdexcept>
+
+THRUST_NAMESPACE_BEGIN
+
+enum class event_errc
+{
+  unknown_event_error
+, no_state
+, no_content
+, last_event_error
+};
+
+/// \return <tt>error_code(static_cast<int>(e), event_category())</tt>
+inline error_code make_error_code(event_errc e);
+
+/// \return <tt>error_condition(static_cast<int>(e), event_category())</tt>.
+inline error_condition make_error_condition(event_errc e);
+
+struct event_error_category : error_category
+{
+  event_error_category() = default;
+
+  virtual char const* name() const
+  {
+    return "event";
+  }
+
+  virtual std::string message(int ev) const
+  {
+    switch (static_cast<event_errc>(ev))
+    {
+      case event_errc::no_state:
+      {
+        return "no_state: an operation that requires an event or future to have "
+               "a stream or content has been performed on a event or future "
+               "without either, e.g. a moved-from or default constructed event "
+               "or future (an event or future may have been consumed more than "
+               "once)";
+      }
+      case event_errc::no_content:
+      {
+        return "no_content: an operation that requires a future to have content "
+               "has been performed on future without any, e.g. a moved-from, "
+               "default constructed, or `thrust::new_stream` constructed future "
+               "(a future may have been consumed more than once)";
+      }
+      default:
+      {
+        return "unknown_event_error: an unknown error with a future "
+               "object has occurred";
+      }
+    };
+  }
+
+  virtual error_condition default_error_condition(int ev) const
+  {
+    if (
+         event_errc::last_event_error
+         >
+         static_cast<event_errc>(ev)
+       )
+      return make_error_condition(static_cast<event_errc>(ev));
+
+    return system_category().default_error_condition(ev);
+  }
+};
+
+/// Obtains a reference to the static error category object for the errors
+/// related to futures and promises. The object is required to override the
+/// virtual function error_category::name() to return a pointer to the string
+/// "event". It is used to identify error codes provided in the
+/// exceptions of type event_error.
+inline error_category const& event_category()
+{
+  static const event_error_category result;
+  return result;
+}
+
+namespace system
+{
+/// Specialization of \p is_error_code_enum for \p event_errc.
+template<> struct is_error_code_enum<event_errc> : true_type {};
+} // end system
+
+/// \return <tt>error_code(static_cast<int>(e), event_category())</tt>
+inline error_code make_error_code(event_errc e)
+{
+  return error_code(static_cast<int>(e), event_category());
+}
+
+/// \return <tt>error_condition(static_cast<int>(e), event_category())</tt>.
+inline error_condition make_error_condition(event_errc e)
+{
+  return error_condition(static_cast<int>(e), event_category());
+}
+
+struct event_error : std::logic_error
+{
+  __host__
+  explicit event_error(error_code ec)
+    : std::logic_error(ec.message()), ec_(ec)
+  {}
+
+  __host__
+  explicit event_error(event_errc e)
+    : event_error(make_error_code(e))
+  {}
+
+  __host__
+  error_code const& code() const noexcept
+  {
+    return ec_;
+  }
+
+  __host__
+  virtual ~event_error() noexcept {}
+
+private:
+  error_code ec_;
+};
+
+inline bool operator==(event_error const& lhs, event_error const& rhs) noexcept
+{
+  return lhs.code() == rhs.code();
+}
+
+inline bool operator<(event_error const& lhs, event_error const& rhs) noexcept
+{
+  return lhs.code() < rhs.code();
+}
+
+THRUST_NAMESPACE_END
+
+#endif // C++14
+
diff --git a/thrust/detail/execute_with_allocator.h b/thrust/detail/execute_with_allocator.h
index 39ac84fb3..430fe739c 100644
--- a/thrust/detail/execute_with_allocator.h
+++ b/thrust/detail/execute_with_allocator.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,80 +17,133 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#include <thrust/detail/execute_with_allocator_fwd.h>
+#include <thrust/pair.h>
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/allocator/allocator_traits.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/pair.h>
+#include <thrust/detail/integer_math.h>
+
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
 namespace detail
 {
 
-template<typename ToPointer, typename FromPointer>
-__host__ __device__
-ToPointer reinterpret_pointer_cast(FromPointer ptr)
+template <
+    typename T
+  , typename Allocator
+  , template <typename> class BaseSystem
+>
+__host__
+thrust::pair<T*, std::ptrdiff_t>
+get_temporary_buffer(
+    thrust::detail::execute_with_allocator<Allocator, BaseSystem>& system
+  , std::ptrdiff_t n
+    )
 {
-  typedef typename thrust::detail::pointer_element<ToPointer>::type to_element;
-  return ToPointer(reinterpret_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::void_pointer                        void_pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+
+  // How many elements of type value_type do we need to accommodate n elements
+  // of type T?
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements);
+
+  // Return the pointer and the number of elements of type T allocated.
+  return thrust::make_pair(thrust::reinterpret_pointer_cast<T*>(ptr),n);
 }
 
+template <
+    typename Pointer
+  , typename Allocator
+  , template <typename> class BaseSystem
+>
+__host__
+void
+return_temporary_buffer(
+    thrust::detail::execute_with_allocator<Allocator, BaseSystem>& system
+  , Pointer p
+  , std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
+}
 
-template<typename Allocator, template <typename> class BaseSystem>
-  struct execute_with_allocator
-    : BaseSystem<execute_with_allocator<Allocator, BaseSystem> >
+#if THRUST_CPP_DIALECT >= 2011
+
+template <
+    typename T,
+    template <typename> class BaseSystem,
+    typename Allocator,
+    typename ...Dependencies
+>
+__host__
+thrust::pair<T*, std::ptrdiff_t>
+get_temporary_buffer(
+    thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system,
+    std::ptrdiff_t n
+    )
 {
-  typedef BaseSystem<
-    execute_with_allocator<Allocator, BaseSystem>
-  > super_t;
-
-  Allocator &m_alloc;
-
-  __host__ __device__
-  execute_with_allocator(const super_t &super, Allocator &alloc)
-    : super_t(super),
-      m_alloc(alloc)
-  {}
-
-  __host__ __device__
-  execute_with_allocator(Allocator &alloc)
-    : m_alloc(alloc)
-  {}
-
-  template<typename T>
-  __host__ __device__
-    friend thrust::pair<T*,std::ptrdiff_t>
-      get_temporary_buffer(execute_with_allocator &system, std::ptrdiff_t n)
-  {
-    typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
-    typedef typename alloc_traits::void_pointer                  void_pointer;
-    typedef typename alloc_traits::size_type                     size_type;
-    typedef typename alloc_traits::value_type                    value_type;
-
-    // how many elements of type value_type do we need to accomodate n elements of type T?
-    size_type num_elements = thrust::detail::util::divide_ri(sizeof(T) * n, sizeof(value_type));
-
-    // allocate that many
-    void_pointer ptr = alloc_traits::allocate(system.m_alloc, num_elements);
-
-    // return the pointer and the number of elements of type T allocated
-    return thrust::make_pair(thrust::detail::reinterpret_pointer_cast<T*>(ptr),n);
-  }
-
-  template<typename Pointer>
-    friend void return_temporary_buffer(execute_with_allocator &system, Pointer p)
-  {
-    typedef typename thrust::detail::allocator_traits<Allocator> alloc_traits;
-    typedef typename alloc_traits::pointer                       pointer;
-
-    // return the pointer to the allocator
-    pointer to_ptr = thrust::detail::reinterpret_pointer_cast<pointer>(p);
-    alloc_traits::deallocate(system.m_alloc, to_ptr, 0);
-  }
-};
-
-
-} // end detail
-} // end thrust
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::void_pointer                        void_pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+
+  // How many elements of type value_type do we need to accommodate n elements
+  // of type T?
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  void_pointer ptr = alloc_traits::allocate(system.get_allocator(), num_elements);
+
+  // Return the pointer and the number of elements of type T allocated.
+  return thrust::make_pair(thrust::reinterpret_pointer_cast<T*>(ptr),n);
+}
+
+template <
+    typename Pointer,
+    template <typename> class BaseSystem,
+    typename Allocator,
+    typename ...Dependencies
+>
+__host__
+void
+return_temporary_buffer(
+    thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system,
+    Pointer p,
+    std::ptrdiff_t n
+    )
+{
+  typedef typename thrust::detail::remove_reference<Allocator>::type naked_allocator;
+  typedef typename thrust::detail::allocator_traits<naked_allocator> alloc_traits;
+  typedef typename alloc_traits::pointer                             pointer;
+  typedef typename alloc_traits::size_type                           size_type;
+  typedef typename alloc_traits::value_type                          value_type;
+  typedef typename thrust::detail::pointer_traits<Pointer>::element_type T;
+
+  size_type num_elements = divide_ri(sizeof(T) * n, sizeof(value_type));
+
+  pointer to_ptr = thrust::reinterpret_pointer_cast<pointer>(p);
+  alloc_traits::deallocate(system.get_allocator(), to_ptr, num_elements);
+}
+
+#endif
+
+} // namespace detail
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/execute_with_allocator_fwd.h b/thrust/detail/execute_with_allocator_fwd.h
new file mode 100644
index 000000000..1d5899a7d
--- /dev/null
+++ b/thrust/detail/execute_with_allocator_fwd.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/execute_with_dependencies.h>
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+template <typename Allocator, template <typename> class BaseSystem>
+struct execute_with_allocator
+  : BaseSystem<execute_with_allocator<Allocator, BaseSystem> >
+{
+private:
+  typedef BaseSystem<execute_with_allocator<Allocator, BaseSystem> > super_t;
+
+  Allocator alloc;
+
+public:
+  __host__ __device__
+  execute_with_allocator(super_t const& super, Allocator alloc_)
+    : super_t(super), alloc(alloc_)
+  {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  execute_with_allocator(Allocator alloc_)
+    : alloc(alloc_)
+  {}
+
+  typename remove_reference<Allocator>::type& get_allocator() { return alloc; }
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(Dependencies&& ...dependencies) const
+  {
+    return { alloc, capture_as_dependency(THRUST_FWD(dependencies))... };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(std::tuple<Dependencies...>& dependencies) const
+  {
+      return { alloc, capture_as_dependency(dependencies) };
+  }
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  after(std::tuple<Dependencies...>&& dependencies) const
+  {
+      return { alloc, capture_as_dependency(std::move(dependencies)) };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(Dependencies&& ...dependencies) const
+  {
+    return { alloc, capture_as_dependency(THRUST_FWD(dependencies))... };
+  }
+
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(std::tuple<Dependencies...>& dependencies) const
+  {
+      return { alloc, capture_as_dependency(dependencies) };
+  }
+  template<typename ...Dependencies>
+  __host__
+  execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>
+  rebind_after(std::tuple<Dependencies...>&& dependencies) const
+  {
+      return { alloc, capture_as_dependency(std::move(dependencies)) };
+  }
+#endif
+};
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/execute_with_dependencies.h b/thrust/detail/execute_with_dependencies.h
new file mode 100644
index 000000000..ec54010b0
--- /dev/null
+++ b/thrust/detail/execute_with_dependencies.h
@@ -0,0 +1,267 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <tuple>
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+struct capture_as_dependency_fn
+{
+  template<typename Dependency>
+  auto operator()(Dependency&& dependency) const
+  THRUST_DECLTYPE_RETURNS(capture_as_dependency(THRUST_FWD(dependency)))
+};
+
+// Default implementation: universal forwarding.
+template<typename Dependency>
+auto capture_as_dependency(Dependency&& dependency)
+THRUST_DECLTYPE_RETURNS(THRUST_FWD(dependency))
+
+template<typename... Dependencies>
+auto capture_as_dependency(std::tuple<Dependencies...>& dependencies)
+THRUST_DECLTYPE_RETURNS(
+  tuple_for_each(THRUST_FWD(dependencies), capture_as_dependency_fn{})
+)
+
+template<template<typename> class BaseSystem, typename... Dependencies>
+struct execute_with_dependencies
+    : BaseSystem<execute_with_dependencies<BaseSystem, Dependencies...>>
+{
+private:
+    using super_t = BaseSystem<execute_with_dependencies<BaseSystem, Dependencies...>>;
+
+    std::tuple<remove_cvref_t<Dependencies>...> dependencies;
+
+public:
+    __host__
+    execute_with_dependencies(super_t const &super, Dependencies && ...dependencies)
+        : super_t(super), dependencies(std::forward<Dependencies>(dependencies)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(super_t const &super, UDependencies && ...deps)
+        : super_t(super), dependencies(THRUST_FWD(deps)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(UDependencies && ...deps)
+        : dependencies(THRUST_FWD(deps)...)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(super_t const &super, std::tuple<UDependencies...>&& deps)
+        : super_t(super), dependencies(std::move(deps))
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_dependencies(std::tuple<UDependencies...>&& deps)
+        : dependencies(std::move(deps))
+    {
+    }
+
+    std::tuple<remove_cvref_t<Dependencies>...>
+    __host__
+    extract_dependencies() 
+    {
+        return std::move(dependencies);
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(UDependencies&& ...udependencies) const
+    {
+        return { capture_as_dependency(THRUST_FWD(udependencies))... };
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>& udependencies) const
+    {
+        return { capture_as_dependency(udependencies) };
+    }
+    template<typename ...UDependencies>
+    __host__
+    execute_with_dependencies<BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>&& udependencies) const
+    {
+        return { capture_as_dependency(std::move(udependencies)) };
+    }
+};
+
+template<
+    typename Allocator,
+    template<typename> class BaseSystem,
+    typename... Dependencies
+>
+struct execute_with_allocator_and_dependencies
+    : BaseSystem<
+        execute_with_allocator_and_dependencies<
+            Allocator,
+            BaseSystem,
+            Dependencies...
+        >
+    >
+{
+private:
+    using super_t = BaseSystem<
+        execute_with_allocator_and_dependencies<
+            Allocator,
+            BaseSystem,
+            Dependencies...
+        >
+    >;
+
+    std::tuple<remove_cvref_t<Dependencies>...> dependencies;
+    Allocator alloc;
+
+public:
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(super_t const &super, Allocator a, UDependencies && ...deps)
+        : super_t(super), dependencies(THRUST_FWD(deps)...), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(Allocator a, UDependencies && ...deps)
+        : dependencies(THRUST_FWD(deps)...), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(super_t const &super, Allocator a, std::tuple<UDependencies...>&& deps)
+        : super_t(super), dependencies(std::move(deps)), alloc(a)
+    {
+    }
+
+    template <typename... UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies(Allocator a, std::tuple<UDependencies...>&& deps)
+        : dependencies(std::move(deps)), alloc(a)
+    {
+    }
+
+    std::tuple<remove_cvref_t<Dependencies>...>
+    __host__
+    extract_dependencies() 
+    {
+        return std::move(dependencies);
+    }
+
+    __host__
+    typename std::add_lvalue_reference<Allocator>::type
+    get_allocator()
+    {
+        return alloc;
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(UDependencies&& ...udependencies) const
+    {
+        return { alloc, capture_as_dependency(THRUST_FWD(udependencies))... };
+    }
+
+    // Rebinding.
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>& udependencies) const
+    {
+        return { alloc, capture_as_dependency(udependencies) };
+    }
+    template<typename ...UDependencies>
+    __host__
+    execute_with_allocator_and_dependencies<Allocator, BaseSystem, UDependencies...>
+    rebind_after(std::tuple<UDependencies...>&& udependencies) const
+    {
+        return { alloc, capture_as_dependency(std::move(udependencies)) };
+    }
+};
+
+template<template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_dependencies<BaseSystem, Dependencies...>&& system)
+{
+    return std::move(system).extract_dependencies();
+}
+template<template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_dependencies<BaseSystem, Dependencies...>& system)
+{
+    return std::move(system).extract_dependencies();
+}
+
+template<typename Allocator, template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>&& system)
+{
+    return std::move(system).extract_dependencies();
+}
+template<typename Allocator, template<typename> class BaseSystem, typename ...Dependencies>
+__host__
+std::tuple<remove_cvref_t<Dependencies>...>
+extract_dependencies(thrust::detail::execute_with_allocator_and_dependencies<Allocator, BaseSystem, Dependencies...>& system)
+{
+    return std::move(system).extract_dependencies();
+}
+
+template<typename System>
+__host__
+std::tuple<>
+extract_dependencies(System &&)
+{
+    return std::tuple<>{};
+}
+
+} // end detail
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/detail/execution_policy.h b/thrust/detail/execution_policy.h
index e17332a40..dcc11a770 100644
--- a/thrust/detail/execution_policy.h
+++ b/thrust/detail/execution_policy.h
@@ -18,11 +18,12 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
+struct execution_policy_marker {};
 
 // execution_policy_base serves as a guard against
 // inifinite recursion in thrust entry points:
@@ -38,41 +39,38 @@ namespace detail
 // foo is not recursive when
 // 1. DerivedPolicy is derived from thrust::execution_policy below
 // 2. generic::foo takes thrust::execution_policy as a parameter
-template<typename DerivedPolicy> struct execution_policy_base {};
+template<typename DerivedPolicy>
+struct execution_policy_base : execution_policy_marker {};
 
 
 template<typename DerivedPolicy>
-__host__ __device__
-inline execution_policy_base<DerivedPolicy> &strip_const(const execution_policy_base<DerivedPolicy> &x)
+constexpr __host__ __device__
+execution_policy_base<DerivedPolicy> &strip_const(const execution_policy_base<DerivedPolicy> &x)
 {
   return const_cast<execution_policy_base<DerivedPolicy>&>(x);
 }
 
 
 template<typename DerivedPolicy>
-__host__ __device__
-inline DerivedPolicy &derived_cast(execution_policy_base<DerivedPolicy> &x)
+constexpr __host__ __device__
+DerivedPolicy &derived_cast(execution_policy_base<DerivedPolicy> &x)
 {
   return static_cast<DerivedPolicy&>(x);
 }
 
 
 template<typename DerivedPolicy>
-__host__ __device__
-inline const DerivedPolicy &derived_cast(const execution_policy_base<DerivedPolicy> &x)
+constexpr __host__ __device__
+const DerivedPolicy &derived_cast(const execution_policy_base<DerivedPolicy> &x)
 {
   return static_cast<const DerivedPolicy&>(x);
 }
 
-
 } // end detail
 
-
 template<typename DerivedPolicy>
   struct execution_policy
     : thrust::detail::execution_policy_base<DerivedPolicy>
 {};
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/extrema.inl b/thrust/detail/extrema.inl
index 3f60743e6..2c1750e7d 100644
--- a/thrust/detail/extrema.inl
+++ b/thrust/detail/extrema.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/extrema.h>
@@ -22,9 +23,7 @@
 #include <thrust/system/detail/generic/extrema.h>
 #include <thrust/system/detail/adl/extrema.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator>
@@ -141,7 +140,7 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
 
 
 template <typename ForwardIterator>
-thrust::pair<ForwardIterator,ForwardIterator> 
+thrust::pair<ForwardIterator,ForwardIterator>
 minmax_element(ForwardIterator first, ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
@@ -155,7 +154,7 @@ minmax_element(ForwardIterator first, ForwardIterator last)
 
 
 template <typename ForwardIterator, typename BinaryPredicate>
-thrust::pair<ForwardIterator,ForwardIterator> 
+thrust::pair<ForwardIterator,ForwardIterator>
 minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp)
 {
   using thrust::system::detail::generic::select_system;
@@ -167,6 +166,4 @@ minmax_element(ForwardIterator first, ForwardIterator last, BinaryPredicate comp
   return thrust::minmax_element(select_system(system), first, last, comp);
 } // end minmax_element()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/fill.inl b/thrust/detail/fill.inl
index 6e957ca1f..e68672bbe 100644
--- a/thrust/detail/fill.inl
+++ b/thrust/detail/fill.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file fill.inl
- *  \brief Inline file for fill.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +24,7 @@
 #include <thrust/system/detail/generic/fill.h>
 #include <thrust/system/detail/adl/fill.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
@@ -86,6 +83,4 @@ __host__ __device__
   return thrust::fill_n(select_system(system), first, n, value);
 } // end fill()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/find.inl b/thrust/detail/find.inl
index f42ff4650..5b494f61a 100644
--- a/thrust/detail/find.inl
+++ b/thrust/detail/find.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file find.inl
- *  \brief Inline file for find.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/detail/adl/find.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename T>
@@ -74,11 +69,11 @@ InputIterator find(InputIterator first,
                    const T& value)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find(select_system(system), first, last, value);
 }
 
@@ -88,11 +83,11 @@ InputIterator find_if(InputIterator first,
                       Predicate pred)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find_if(select_system(system), first, last, pred);
 }
 
@@ -102,14 +97,12 @@ InputIterator find_if_not(InputIterator first,
                           Predicate pred)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<InputIterator>::type System;
-  
+
   System system;
-  
+
   return thrust::find_if_not(select_system(system), first, last, pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/for_each.inl b/thrust/detail/for_each.inl
index 3365ce2e0..4ba39c71a 100644
--- a/thrust/detail/for_each.inl
+++ b/thrust/detail/for_each.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file for_each.inl
- *  \brief Inline file for for_each.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/for_each.h>
@@ -26,10 +23,9 @@
 #include <thrust/system/detail/generic/for_each.h>
 #include <thrust/system/detail/adl/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy,
          typename InputIterator,
          typename UnaryFunction>
@@ -58,7 +54,7 @@ InputIterator for_each(InputIterator first,
   return thrust::for_each(select_system(system), first, last, f);
 } // end for_each()
 
-__thrust_exec_check_disable__ 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename Size, typename UnaryFunction>
 __host__ __device__
   InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
@@ -87,6 +83,4 @@ InputIterator for_each_n(InputIterator first,
   return thrust::for_each_n(select_system(system), first, n, f);
 } // end for_each_n()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/function.h b/thrust/detail/function.h
index f1f9e9c94..ba20507a5 100644
--- a/thrust/detail/function.h
+++ b/thrust/detail/function.h
@@ -19,85 +19,143 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/raw_reference_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
-
-template<typename Function, typename Result>
-  struct wrapped_function
+template <typename Function, typename Result>
+struct wrapped_function
 {
   // mutable because Function::operator() might be const
   mutable Function m_f;
 
   inline __host__ __device__
   wrapped_function()
-    : m_f()
+      : m_f()
   {}
 
   inline __host__ __device__
-  wrapped_function(const Function &f)
-    : m_f(f)
+  wrapped_function(const Function& f)
+      : m_f(f)
   {}
 
   __thrust_exec_check_disable__
-  template<typename Argument>
-  inline __host__ __device__
-    Result operator()(Argument &x) const
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument& x) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
     return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument>
-    inline __host__ __device__ Result operator()(const Argument &x) const
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument& x) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
     return static_cast<Result>(m_f(thrust::raw_reference_cast(x)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(Argument1 &x, Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument1& x, Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(const Argument1 &x, Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument1& x, Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(const Argument1 &x, const Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(const Argument1& x, const Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 
   __thrust_exec_check_disable__
-  template<typename Argument1, typename Argument2>
-    inline __host__ __device__ Result operator()(Argument1 &x, const Argument2 &y) const
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  Result operator()(Argument1& x, const Argument2& y) const
   {
-    // we static cast to Result to handle void Result without error
-    // in case Function's result is non-void
-    return static_cast<Result>(m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y)));
+    return static_cast<Result>(m_f(thrust::raw_reference_cast(x),
+                                   thrust::raw_reference_cast(y)));
   }
 }; // end wrapped_function
 
+// Specialize for void return types:
+template <typename Function>
+struct wrapped_function<Function, void>
+{
+  // mutable because Function::operator() might be const
+  mutable Function m_f;
+  inline __host__ __device__
+  wrapped_function()
+    : m_f()
+  {}
+
+  inline __host__ __device__
+  wrapped_function(const Function& f)
+    : m_f(f)
+  {}
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument& x) const
+  {
+    m_f(thrust::raw_reference_cast(x));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument1& x, Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(const Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+  __thrust_exec_check_disable__
+  template <typename Argument1, typename Argument2>
+  __thrust_forceinline__ __host__ __device__
+  void operator()(Argument1& x, const Argument2& y) const
+  {
+    m_f(thrust::raw_reference_cast(x), thrust::raw_reference_cast(y));
+  }
+}; // end wrapped_function
 
-} // end detail
-} // end thrust
+} // namespace detail
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional.inl b/thrust/detail/functional.inl
index 0cdec0b68..bdf8e0415 100644
--- a/thrust/detail/functional.inl
+++ b/thrust/detail/functional.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,10 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -107,16 +110,17 @@ template<typename Result, typename Argument1, typename Argument2>
 }; // end binary_traits
 
 template<typename Predicate>
+  __host__ __device__
   unary_negate<Predicate> not1(const Predicate &pred)
 {
   return unary_negate<Predicate>(pred);
 } // end not1()
 
 template<typename BinaryPredicate>
+  __host__ __device__
   binary_negate<BinaryPredicate> not2(const BinaryPredicate &pred)
 {
   return binary_negate<BinaryPredicate>(pred);
 } // end not2()
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional/actor.h b/thrust/detail/functional/actor.h
index 39e29ec9b..cee0770a4 100644
--- a/thrust/detail/functional/actor.h
+++ b/thrust/detail/functional/actor.h
@@ -30,15 +30,23 @@
 #include <thrust/detail/functional/value.h>
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/assignment_operator.h>
-#include <thrust/detail/type_traits/result_of.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
+// eval_ref<T> is
+// - T when T is a subclass of thrust::reference
+// - T& otherwise
+// This is used to let thrust::references pass through actor evaluations.
+template <typename T>
+using eval_ref = typename std::conditional<
+  thrust::detail::is_wrapped_reference<T>::value, T, T&>::type;
+
 template<typename Action, typename Env>
   struct apply_actor
 {
@@ -52,7 +60,7 @@ template<typename Eval>
   typedef Eval eval_type;
 
   __host__ __device__
-  actor(void);
+  constexpr actor();
 
   __host__ __device__
   actor(const Eval &base);
@@ -61,55 +69,10 @@ template<typename Eval>
   typename apply_actor<eval_type, thrust::null_type >::type
   operator()(void) const;
 
-  template<typename T0>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&> >::type
-  operator()(T0 &_0) const;
-
-  template<typename T0, typename T1>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&> >::type
-  operator()(T0 &_0, T1 &_1) const;
-
-  template<typename T0, typename T1, typename T2>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2) const;
-
-  template<typename T0, typename T1, typename T2, typename T3>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-  __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const;
-
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
+  template <typename... Ts>
   __host__ __device__
-  typename apply_actor<eval_type, thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&> >::type
-  operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const;
+  typename apply_actor<eval_type, thrust::tuple<eval_ref<Ts>...>>::type
+  operator()(Ts&&... ts) const;
 
   template<typename T>
   __host__ __device__
@@ -153,7 +116,7 @@ template<typename T>
 
 // provide specializations for result_of for nullary, unary, and binary invocations of actor
 template<typename Eval>
-  struct result_of<
+  struct result_of_adaptable_function<
     thrust::detail::functional::actor<Eval>()
   >
 {
@@ -164,7 +127,7 @@ template<typename Eval>
 }; // end result_of
 
 template<typename Eval, typename Arg1>
-  struct result_of<
+  struct result_of_adaptable_function<
     thrust::detail::functional::actor<Eval>(Arg1)
   >
 {
@@ -175,7 +138,7 @@ template<typename Eval, typename Arg1>
 }; // end result_of
 
 template<typename Eval, typename Arg1, typename Arg2>
-  struct result_of<
+  struct result_of_adaptable_function<
     thrust::detail::functional::actor<Eval>(Arg1,Arg2)
   >
 {
@@ -186,7 +149,7 @@ template<typename Eval, typename Arg1, typename Arg2>
 }; // end result_of
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/functional/actor.inl>
 
diff --git a/thrust/detail/functional/actor.inl b/thrust/detail/functional/actor.inl
index 7c7c94961..e0bdebbbf 100644
--- a/thrust/detail/functional/actor.inl
+++ b/thrust/detail/functional/actor.inl
@@ -23,13 +23,17 @@
 // Based on Boost.Phoenix v1.2
 // Copyright (c) 2001-2002 Joel de Guzman
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/assignment_operator.h>
 #include <thrust/functional.h>
+#include <thrust/type_traits/logical_metafunctions.h>
 
-namespace thrust
-{
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -37,18 +41,21 @@ namespace functional
 {
 
 template<typename Eval>
-  actor<Eval>
-    ::actor(void)
+  __host__ __device__
+  constexpr actor<Eval>
+    ::actor()
       : eval_type()
 {}
 
 template<typename Eval>
+  __host__ __device__
   actor<Eval>
     ::actor(const Eval &base)
       : eval_type(base)
 {}
 
 template<typename Eval>
+  __host__ __device__
   typename apply_actor<
     typename actor<Eval>::eval_type,
     typename thrust::null_type
@@ -59,128 +66,42 @@ template<typename Eval>
   return eval_type::eval(thrust::null_type());
 } // end basic_environment::operator()
 
-template<typename Eval>
-  template<typename T0>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0) const
-{
-  return eval_type::eval(thrust::tie(_0));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1) const
-{
-  return eval_type::eval(thrust::tie(_0,_1));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2));
-} // end basic_environment::operator()
+// actor::operator() needs to construct a tuple of references to its
+// arguments. To make this work with thrust::reference<T>, we need to
+// detect thrust proxy references and store them as T rather than T&.
+// This check ensures that the forwarding references passed into
+// actor::operator() are either:
+// - T&& if and only if T is a thrust::reference<U>, or
+// - T& for any other types.
+// This struct provides a nicer diagnostic for when these conditions aren't
+// met.
+template <typename T>
+using actor_check_ref_type =
+  thrust::detail::integral_constant<bool,
+    ( std::is_lvalue_reference<T>::value ||
+      thrust::detail::is_wrapped_reference<T>::value )>;
+
+template <typename... Ts>
+using actor_check_ref_types =
+  thrust::conjunction<actor_check_ref_type<Ts>...>;
 
 template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3) const
+template<typename... Ts>
+__host__ __device__
+typename apply_actor<typename actor<Eval>::eval_type,
+                     thrust::tuple<eval_ref<Ts>...>>::type
+actor<Eval>::operator()(Ts&&... ts) const
 {
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8));
-} // end basic_environment::operator()
-
-template<typename Eval>
-  template<typename T0, typename T1, typename T2, typename T3, typename T4, typename T5, typename T6, typename T7, typename T8, typename T9>
-    typename apply_actor<
-      typename actor<Eval>::eval_type,
-      typename thrust::tuple<T0&,T1&,T2&,T3&,T4&,T5&,T6&,T7&,T8&,T9&>
-    >::type
-      actor<Eval>
-        ::operator()(T0 &_0, T1 &_1, T2 &_2, T3 &_3, T4 &_4, T5 &_5, T6 &_6, T7 &_7, T8 &_8, T9 &_9) const
-{
-  return eval_type::eval(thrust::tie(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9));
-} // end basic_environment::operator()
+  static_assert(actor_check_ref_types<Ts...>::value,
+                "Actor evaluations only support rvalue references to "
+                "thrust::reference subclasses.");
+  using tuple_type = thrust::tuple<eval_ref<Ts>...>;
+  return eval_type::eval(tuple_type(THRUST_FWD(ts)...));
+} // end actor<Eval>::operator()
 
 template<typename Eval>
   template<typename T>
+    __host__ __device__
     typename assign_result<Eval,T>::type
       actor<Eval>
         ::operator=(const T& _1) const
@@ -190,5 +111,4 @@ template<typename Eval>
 
 } // end functional
 } // end detail
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/functional/argument.h b/thrust/detail/functional/argument.h
index 88b48a6d2..aac29f537 100644
--- a/thrust/detail/functional/argument.h
+++ b/thrust/detail/functional/argument.h
@@ -28,8 +28,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -59,7 +58,7 @@ template<unsigned int i>
     };
 
     __host__ __device__
-    argument(void){}
+    constexpr argument(){}
 
     template<typename Env>
     __host__ __device__
@@ -71,5 +70,5 @@ template<unsigned int i>
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/composite.h b/thrust/detail/functional/composite.h
index 6cf095bf1..41ee74739 100644
--- a/thrust/detail/functional/composite.h
+++ b/thrust/detail/functional/composite.h
@@ -25,11 +25,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/functional/actor.h>
 #include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -159,5 +160,5 @@ __host__ __device__
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/arithmetic_operators.h b/thrust/detail/functional/operators/arithmetic_operators.h
index 6628917d6..443d307cb 100644
--- a/thrust/detail/functional/operators/arithmetic_operators.h
+++ b/thrust/detail/functional/operators/arithmetic_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,49 +32,57 @@ template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<thrust::negate>,
+    transparent_unary_operator<thrust::negate<>>,
     actor<Eval>
   >
 >
 __host__ __device__
 operator-(const actor<Eval> &_1)
 {
-  return compose(unary_operator<thrust::negate>(), _1);
+  return compose(transparent_unary_operator<thrust::negate<>>(), _1);
 } // end operator-()
 
 // there's no standard unary_plus functional, so roll an ad hoc one here
-template<typename T>
-  struct unary_plus
-    : public thrust::unary_function<T,T>
+struct unary_plus
 {
-  __host__ __device__ T operator()(const T &x) const {return +x;}
-}; // end unary_plus
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(+THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(+THRUST_FWD(t1)))
+  {
+    return +THRUST_FWD(t1);
+  }
+};
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<unary_plus>,
+    transparent_unary_operator<unary_plus>,
     actor<Eval>
   >
 >
 operator+(const actor<Eval> &_1)
 {
-  return compose(unary_operator<unary_plus>(), _1);
+  return compose(transparent_unary_operator<unary_plus>(), _1);
 } // end operator+()
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator+(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -84,14 +91,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator+(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -100,14 +107,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::plus>,
+    transparent_binary_operator<thrust::plus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator+(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::plus>(),
+  return compose(transparent_binary_operator<thrust::plus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+()
@@ -116,14 +123,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator-(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -132,14 +139,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator-(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -148,14 +155,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::minus>,
+    transparent_binary_operator<thrust::minus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator-(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::minus>(),
+  return compose(transparent_binary_operator<thrust::minus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-()
@@ -164,14 +171,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator*(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -180,14 +187,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator*(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -196,14 +203,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::multiplies>,
+    transparent_binary_operator<thrust::multiplies<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator*(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::multiplies>(),
+  return compose(transparent_binary_operator<thrust::multiplies<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*()
@@ -212,14 +219,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator/(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -228,14 +235,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator/(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -244,14 +251,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::divides>,
+    transparent_binary_operator<thrust::divides<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator/(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::divides>(),
+  return compose(transparent_binary_operator<thrust::divides<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/()
@@ -260,14 +267,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator%(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
@@ -276,14 +283,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator%(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<void>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
@@ -292,103 +299,138 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::modulus>,
+    transparent_binary_operator<thrust::modulus<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator%(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::modulus>(),
+  return compose(transparent_binary_operator<thrust::modulus<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%()
 
 // there's no standard prefix_increment functional, so roll an ad hoc one here
-template<typename T>
-  struct prefix_increment
-    : public thrust::unary_function<T&,T&>
+struct prefix_increment
 {
-  __host__ __device__ T& operator()(T &x) const { return ++x; }
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(++THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(++THRUST_FWD(t1)))
+  {
+    return ++THRUST_FWD(t1);
+  }
 }; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<prefix_increment>,
+    transparent_unary_operator<prefix_increment>,
     actor<Eval>
   >
 >
 operator++(const actor<Eval> &_1)
 {
-  return compose(unary_operator<prefix_increment>(), _1);
+  return compose(transparent_unary_operator<prefix_increment>(), _1);
 } // end operator++()
 
-// there's no standard suffix_increment functional, so roll an ad hoc one here
-template<typename T>
-  struct suffix_increment
-    : public thrust::unary_function<T&,T>
+
+// there's no standard postfix_increment functional, so roll an ad hoc one here
+struct postfix_increment
 {
-  __host__ __device__ T operator()(T &x) const { return x++; }
-}; // end suffix_increment
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)++))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)++))
+  {
+    return THRUST_FWD(t1)++;
+  }
+}; // end postfix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<suffix_increment>,
+    transparent_unary_operator<postfix_increment>,
     actor<Eval>
   >
 >
 operator++(const actor<Eval> &_1, int)
 {
-  return compose(unary_operator<suffix_increment>(), _1);
+  return compose(transparent_unary_operator<postfix_increment>(), _1);
 } // end operator++()
 
+
 // there's no standard prefix_decrement functional, so roll an ad hoc one here
-template<typename T>
-  struct prefix_decrement
-    : public thrust::unary_function<T&,T&>
+struct prefix_decrement
 {
-  __host__ __device__ T& operator()(T &x) const { return --x; }
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(--THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(--THRUST_FWD(t1)))
+  {
+    return --THRUST_FWD(t1);
+  }
 }; // end prefix_decrement
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<prefix_decrement>,
+    transparent_unary_operator<prefix_decrement>,
     actor<Eval>
   >
 >
 operator--(const actor<Eval> &_1)
 {
-  return compose(unary_operator<prefix_decrement>(), _1);
+  return compose(transparent_unary_operator<prefix_decrement>(), _1);
 } // end operator--()
 
-// there's no standard suffix_decrement functional, so roll an ad hoc one here
-template<typename T>
-  struct suffix_decrement
-    : public thrust::unary_function<T&,T>
+
+// there's no standard postfix_decrement functional, so roll an ad hoc one here
+struct postfix_decrement
 {
-  __host__ __device__ T operator()(T &x) const { return x--; }
-}; // end suffix_decrement
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(THRUST_FWD(t1)--))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)--))
+  {
+    return THRUST_FWD(t1)--;
+  }
+}; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<suffix_decrement>,
+    transparent_unary_operator<postfix_decrement>,
     actor<Eval>
   >
 >
 operator--(const actor<Eval> &_1, int)
 {
-  return compose(unary_operator<suffix_decrement>(), _1);
+  return compose(transparent_unary_operator<postfix_decrement>(), _1);
 } // end operator--()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/assignment_operator.h b/thrust/detail/functional/operators/assignment_operator.h
index fb8958f88..870354b6f 100644
--- a/thrust/detail/functional/operators/assignment_operator.h
+++ b/thrust/detail/functional/operators/assignment_operator.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // XXX WAR circular inclusion with this forward declaration
 template<typename,typename,typename> struct binary_function;
@@ -37,19 +36,27 @@ namespace functional
 template<typename> struct as_actor;
 
 // there's no standard assign functional, so roll an ad hoc one here
-template<typename T>
-  struct assign
-    : thrust::binary_function<T&,T,T&>
+struct assign
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs = rhs; }
-}; // end assign
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) = THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) = THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) = THRUST_FWD(t2);
+  }
+};
 
 template<typename Eval, typename T>
   struct assign_result
 {
   typedef actor<
     composite<
-      binary_operator<assign>,
+      transparent_binary_operator<assign>,
       actor<Eval>,
       typename as_actor<T>::type
     >
@@ -61,12 +68,12 @@ template<typename Eval, typename T>
     typename assign_result<Eval,T>::type
       do_assign(const actor<Eval> &_1, const T &_2)
 {
-  return compose(binary_operator<assign>(),
+  return compose(transparent_binary_operator<assign>(),
                  _1,
                  as_actor<T>::convert(_2));
 } // end do_assign()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/bitwise_operators.h b/thrust/detail/functional/operators/bitwise_operators.h
index 796f1701c..065cd1540 100644
--- a/thrust/detail/functional/operators/bitwise_operators.h
+++ b/thrust/detail/functional/operators/bitwise_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator&(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_and>,
+    transparent_binary_operator<bit_and<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_and>(),
+  return compose(transparent_binary_operator<bit_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator|(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator|(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_or>,
+    transparent_binary_operator<bit_or<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator|(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_or>(),
+  return compose(transparent_binary_operator<bit_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|()
@@ -129,14 +128,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator^(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
@@ -145,14 +144,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator^(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
@@ -161,60 +160,77 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::bit_xor>,
+    transparent_binary_operator<bit_xor<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator^(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::bit_xor>(),
+  return compose(transparent_binary_operator<bit_xor<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator^()
 
+
 // there's no standard bit_not functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_not
-    : public thrust::unary_function<T,T>
+struct bit_not
 {
-  __host__ __device__ T operator()(const T &x) const {return ~x;}
-}; // end bit_not
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1) const
+  noexcept(noexcept(~THRUST_FWD(t1)))
+  THRUST_TRAILING_RETURN(decltype(~THRUST_FWD(t1)))
+  {
+    return ~THRUST_FWD(t1);
+  }
+}; // end prefix_increment
 
 template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<bit_not>,
+    transparent_unary_operator<bit_not>,
     actor<Eval>
   >
 >
 __host__ __device__
 operator~(const actor<Eval> &_1)
 {
-  return compose(unary_operator<bit_not>(), _1);
+  return compose(transparent_unary_operator<bit_not>(), _1);
 } // end operator~()
 
 // there's no standard bit_lshift functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_lshift
-    : public thrust::binary_function<T,T,T>
+struct bit_lshift
 {
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs << rhs;}
-}; // end bit_lshift
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) << THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) << THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) << THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<<(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
@@ -223,14 +239,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<<(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
@@ -239,38 +255,47 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift>,
+    transparent_binary_operator<bit_lshift>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<<(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift>(),
+  return compose(transparent_binary_operator<bit_lshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<()
 
 // there's no standard bit_rshift functional, so roll an ad hoc one here
-template<typename T>
-  struct bit_rshift
-    : public thrust::binary_function<T,T,T>
+struct bit_rshift
 {
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs >> rhs;}
-}; // end bit_rshift
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >> THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) >> THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) >> THRUST_FWD(t2);
+  }
+};
+
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>>(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
@@ -279,14 +304,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>>(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
@@ -295,19 +320,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift>,
+    transparent_binary_operator<bit_rshift>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>>(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift>(),
+  return compose(transparent_binary_operator<bit_rshift>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/compound_assignment_operators.h b/thrust/detail/functional/operators/compound_assignment_operators.h
index cb8d4c105..b5ba77fb4 100644
--- a/thrust/detail/functional/operators/compound_assignment_operators.h
+++ b/thrust/detail/functional/operators/compound_assignment_operators.h
@@ -21,32 +21,40 @@
 #include <thrust/detail/functional/composite.h>
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
-template<typename T>
-  struct plus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard plus_equal functional, so roll an ad hoc one here
+struct plus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs += rhs; }
-}; // end plus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) += THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) += THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) += THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<plus_equal>,
+    transparent_binary_operator<plus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator+=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<plus_equal>(),
+  return compose(transparent_binary_operator<plus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+=()
@@ -55,37 +63,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<plus_equal>,
+    transparent_binary_operator<plus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator+=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<plus_equal>(),
+  return compose(transparent_binary_operator<plus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator+=()
 
-template<typename T>
-  struct minus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard minus_equal functional, so roll an ad hoc one here
+struct minus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T &rhs) const { return lhs -= rhs; }
-}; // end minus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) -= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) -= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) -= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<minus_equal>,
+    transparent_binary_operator<minus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator-=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<minus_equal>(),
+  return compose(transparent_binary_operator<minus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-=()
@@ -94,37 +111,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<minus_equal>,
+    transparent_binary_operator<minus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator-=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<minus_equal>(),
+  return compose(transparent_binary_operator<minus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator-=()
 
-template<typename T>
-  struct multiplies_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard multiplies_equal functional, so roll an ad hoc one here
+struct multiplies_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs *= rhs; }
-}; // end multiplies_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) *= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) *= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) *= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<multiplies_equal>,
+    transparent_binary_operator<multiplies_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator*=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<multiplies_equal>(),
+  return compose(transparent_binary_operator<multiplies_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*=()
@@ -133,37 +159,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<multiplies_equal>,
+    transparent_binary_operator<multiplies_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator*=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<multiplies_equal>(),
+  return compose(transparent_binary_operator<multiplies_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator*=()
 
-template<typename T>
-  struct divides_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard divides_equal functional, so roll an ad hoc one here
+struct divides_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs /= rhs; }
-}; // end divides_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) /= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) /= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) /= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<divides_equal>,
+    transparent_binary_operator<divides_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator/=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<divides_equal>(),
+  return compose(transparent_binary_operator<divides_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/=()
@@ -172,37 +207,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<divides_equal>,
+    transparent_binary_operator<divides_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator/=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<divides_equal>(),
+  return compose(transparent_binary_operator<divides_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator/=()
 
-template<typename T>
-  struct modulus_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard modulus_equal functional, so roll an ad hoc one here
+struct modulus_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs %= rhs; }
-}; // end modulus_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) %= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) %= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) %= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<modulus_equal>,
+    transparent_binary_operator<modulus_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator%=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<modulus_equal>(),
+  return compose(transparent_binary_operator<modulus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%=()
@@ -211,37 +255,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<modulus_equal>,
+    transparent_binary_operator<modulus_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator%=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<modulus_equal>(),
+  return compose(transparent_binary_operator<modulus_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator%=()
 
-template<typename T>
-  struct bit_and_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_and_equal functional, so roll an ad hoc one here
+struct bit_and_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs &= rhs; }
-}; // end bit_and_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) &= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) &= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) &= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_and_equal>,
+    transparent_binary_operator<bit_and_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_and_equal>(),
+  return compose(transparent_binary_operator<bit_and_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&=()
@@ -250,37 +303,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_and_equal>,
+    transparent_binary_operator<bit_and_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_and_equal>(),
+  return compose(transparent_binary_operator<bit_and_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&=()
 
-template<typename T>
-  struct bit_or_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_or_equal functional, so roll an ad hoc one here
+struct bit_or_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs |= rhs; }
-}; // end bit_or_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) |= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) |= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) |= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_or_equal>,
+    transparent_binary_operator<bit_or_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator|=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_or_equal>(),
+  return compose(transparent_binary_operator<bit_or_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
@@ -289,37 +351,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_or_equal>,
+    transparent_binary_operator<bit_or_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator|=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_or_equal>(),
+  return compose(transparent_binary_operator<bit_or_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
 
-template<typename T>
-  struct bit_xor_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_xor_equal functional, so roll an ad hoc one here
+struct bit_xor_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs ^= rhs; }
-}; // end bit_xor_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) ^= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) ^= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) ^= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_xor_equal>,
+    transparent_binary_operator<bit_xor_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator^=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_xor_equal>(),
+  return compose(transparent_binary_operator<bit_xor_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
@@ -328,37 +399,45 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_xor_equal>,
+    transparent_binary_operator<bit_xor_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator^=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_xor_equal>(),
+  return compose(transparent_binary_operator<bit_xor_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator|=()
 
-template<typename T>
-  struct bit_lshift_equal
-    : public thrust::binary_function<T&,T,T&>
-{
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs <<= rhs; }
-}; // end bit_lshift_equal
-
+// there's no standard bit_lshift_equal functional, so roll an ad hoc one here
+struct bit_lshift_equal
+{
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) <<= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) <<= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) <<= THRUST_FWD(t2);
+  }
+};
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift_equal>,
+    transparent_binary_operator<bit_lshift_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<<=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_lshift_equal>(),
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<=()
@@ -367,37 +446,46 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_lshift_equal>,
+    transparent_binary_operator<bit_lshift_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<<=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_lshift_equal>(),
+  return compose(transparent_binary_operator<bit_lshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<<=()
 
-template<typename T>
-  struct bit_rshift_equal
-    : public thrust::binary_function<T&,T,T&>
+// there's no standard bit_rshift_equal functional, so roll an ad hoc one here
+struct bit_rshift_equal
 {
-  __host__ __device__ T& operator()(T &lhs, const T&rhs) const { return lhs >>= rhs; }
-}; // end bit_rshift_equal
+  using is_transparent = void;
+
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t1) >>= THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1) >>= THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t1) >>= THRUST_FWD(t2);
+  }
+};
 
 template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift_equal>,
+    transparent_binary_operator<bit_rshift_equal>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>>=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<bit_rshift_equal>(),
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>=()
@@ -406,19 +494,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<bit_rshift_equal>,
+    transparent_binary_operator<bit_rshift_equal>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>>=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<bit_rshift_equal>(),
+  return compose(transparent_binary_operator<bit_rshift_equal>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>>=()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/logical_operators.h b/thrust/detail/functional/operators/logical_operators.h
index f5e39e125..e1e4ff719 100644
--- a/thrust/detail/functional/operators/logical_operators.h
+++ b/thrust/detail/functional/operators/logical_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator&&(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator&&(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_and>,
+    transparent_binary_operator<thrust::logical_and<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator&&(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_and>(),
+  return compose(transparent_binary_operator<thrust::logical_and<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator||(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator||(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::logical_or>,
+    transparent_binary_operator<thrust::logical_or<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator||(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::logical_or>(),
+  return compose(transparent_binary_operator<thrust::logical_or<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator&&()
@@ -129,16 +128,16 @@ template<typename Eval>
 __host__ __device__
 actor<
   composite<
-    unary_operator<thrust::logical_not>,
+    transparent_unary_operator<thrust::logical_not<>>,
     actor<Eval>
   >
 >
 operator!(const actor<Eval> &_1)
 {
-  return compose(unary_operator<thrust::logical_not>(), _1);
+  return compose(transparent_unary_operator<thrust::logical_not<>>(), _1);
 } // end operator!()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/operator_adaptors.h b/thrust/detail/functional/operators/operator_adaptors.h
index 664921113..67326c5c1 100644
--- a/thrust/detail/functional/operators/operator_adaptors.h
+++ b/thrust/detail/functional/operators/operator_adaptors.h
@@ -17,99 +17,120 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/functional/argument.h>
+#include <thrust/detail/type_deduction.h>
 #include <thrust/tuple.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/void_t.h>
 
-namespace thrust
-{
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
 {
 
-// this thing (which models Eval) is an adaptor for the unary
-// functors inside functional.h
-template<template<typename> class UnaryOperator>
-  struct unary_operator
+// Adapts a transparent unary functor from functional.h (e.g. thrust::negate<>)
+// into the Eval interface.
+template <typename UnaryFunctor>
+struct transparent_unary_operator
 {
-  template<typename Env>
-    struct argument
-      : thrust::detail::eval_if<
-          (thrust::tuple_size<Env>::value == 0),
-          thrust::detail::identity_<thrust::null_type>,
-          thrust::tuple_element<0,Env>
-        >
-  {
-  };
+  template <typename>
+  using operator_type = UnaryFunctor;
 
-  template<typename Env>
-    struct operator_type
+  template <typename Env>
+  using argument =
+  typename thrust::detail::eval_if<
+    thrust::tuple_size<Env>::value != 1,
+    thrust::detail::identity_<thrust::null_type>,
+    thrust::detail::functional::argument_helper<0, Env>
+  >::type;
+
+  template <typename Env>
+  struct result_type_impl
   {
-    typedef UnaryOperator<
-      typename thrust::detail::remove_reference<
-        typename argument<Env>::type
-      >::type
-    > type;
+    using type = decltype(
+      std::declval<UnaryFunctor>()(std::declval<argument<Env>>()));
   };
 
-  template<typename Env>
-    struct result
+  template <typename Env>
+  using result_type =
+  typename thrust::detail::eval_if<
+    std::is_same<thrust::null_type, argument<Env>>::value,
+    thrust::detail::identity_<thrust::null_type>,
+    result_type_impl<Env>
+  >::type;
+
+  template <typename Env>
+  struct result
   {
-    typedef typename operator_type<Env>::type op_type;
-    typedef typename op_type::result_type type;
+    using op_type = UnaryFunctor;
+    using type = result_type<Env>;
   };
 
-  template<typename Env>
+  template <typename Env>
   __host__ __device__
-  typename result<Env>::type eval(const Env &e) const
-  {
-    typename operator_type<Env>::type op;
-    return op(thrust::get<0>(e));
-  } // end eval()
-}; // end unary_operator
-
-// this thing (which models Eval) is an adaptor for the binary
-// functors inside functional.h
-template<template<typename> class BinaryOperator>
-  struct binary_operator
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(UnaryFunctor{}(thrust::get<0>(THRUST_FWD(e))))
+};
+
+
+// Adapts a transparent binary functor from functional.h (e.g. thrust::less<>)
+// into the Eval interface.
+template <typename BinaryFunctor>
+struct transparent_binary_operator
 {
-  template<typename Env>
-    struct first_argument
-      : thrust::detail::eval_if<
-          (thrust::tuple_size<Env>::value == 0),
-          thrust::detail::identity_<thrust::null_type>,
-          thrust::tuple_element<0,Env>
-        >
-  {
-  };
+  template <typename>
+  using operator_type = BinaryFunctor;
+
+  template <typename Env>
+  using first_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<0, Env>
+    >::type;
 
-  template<typename Env>
-    struct operator_type
+  template <typename Env>
+  using second_argument =
+    typename thrust::detail::eval_if<
+      thrust::tuple_size<Env>::value != 2,
+      thrust::detail::identity_<thrust::null_type>,
+      thrust::detail::functional::argument_helper<1, Env>
+    >::type;
+
+  template <typename Env>
+  struct result_type_impl
   {
-    typedef BinaryOperator<
-      typename thrust::detail::remove_reference<
-        typename first_argument<Env>::type
-      >::type
-    > type;
+    using type = decltype(
+      std::declval<BinaryFunctor>()(std::declval<first_argument<Env>>(),
+                                    std::declval<second_argument<Env>>()));
   };
 
-  template<typename Env>
-    struct result
+  template <typename Env>
+  using result_type =
+    typename thrust::detail::eval_if<
+      (std::is_same<thrust::null_type, first_argument<Env>>::value ||
+       std::is_same<thrust::null_type, second_argument<Env>>::value),
+      thrust::detail::identity_<thrust::null_type>,
+      result_type_impl<Env>
+    >::type;
+
+  template <typename Env>
+  struct result
   {
-    typedef typename operator_type<Env>::type op_type;
-    typedef typename op_type::result_type type;
+    using op_type = BinaryFunctor;
+    using type = result_type<Env>;
   };
 
-  template<typename Env>
+  template <typename Env>
   __host__ __device__
-  typename result<Env>::type eval(const Env &e) const
-  {
-    typename operator_type<Env>::type op;
-    return op(thrust::get<0>(e), thrust::get<1>(e));
-  } // end eval()
-}; // end binary_operator
+  result_type<Env> eval(Env&& e) const
+  THRUST_RETURNS(BinaryFunctor{}(thrust::get<0>(e), thrust::get<1>(e)))
+};
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/operators/relational_operators.h b/thrust/detail/functional/operators/relational_operators.h
index ec8864715..6c58325e2 100644
--- a/thrust/detail/functional/operators/relational_operators.h
+++ b/thrust/detail/functional/operators/relational_operators.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/functional/operators/operator_adaptors.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -33,14 +32,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator==(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -49,14 +48,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator==(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -65,14 +64,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::equal_to>,
+    transparent_binary_operator<thrust::equal_to<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator==(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::equal_to>(),
+  return compose(transparent_binary_operator<thrust::equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator==()
@@ -81,14 +80,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator!=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -97,14 +96,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator!=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -113,14 +112,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::not_equal_to>,
+    transparent_binary_operator<thrust::not_equal_to<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator!=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::not_equal_to>(),
+  return compose(transparent_binary_operator<thrust::not_equal_to<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator!=()
@@ -129,14 +128,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -145,14 +144,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -161,14 +160,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater>,
+    transparent_binary_operator<thrust::greater<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater>(),
+  return compose(transparent_binary_operator<thrust::greater<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>()
@@ -177,14 +176,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -193,14 +192,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -209,14 +208,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less>,
+    transparent_binary_operator<thrust::less<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less>(),
+  return compose(transparent_binary_operator<thrust::less<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<()
@@ -225,14 +224,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator>=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -241,14 +240,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator>=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -257,14 +256,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::greater_equal>,
+    transparent_binary_operator<thrust::greater_equal<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator>=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::greater_equal>(),
+  return compose(transparent_binary_operator<thrust::greater_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator>=()
@@ -273,14 +272,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     actor<T1>,
     typename as_actor<T2>::type
   >
 >
 operator<=(const actor<T1> &_1, const T2 &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
@@ -289,14 +288,14 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     typename as_actor<T1>::type,
     actor<T2>
   >
 >
 operator<=(const T1 &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
@@ -305,19 +304,19 @@ template<typename T1, typename T2>
 __host__ __device__
 actor<
   composite<
-    binary_operator<thrust::less_equal>,
+    transparent_binary_operator<thrust::less_equal<>>,
     actor<T1>,
     actor<T2>
   >
 >
 operator<=(const actor<T1> &_1, const actor<T2> &_2)
 {
-  return compose(binary_operator<thrust::less_equal>(),
+  return compose(transparent_binary_operator<thrust::less_equal<>>(),
                  make_actor(_1),
                  make_actor(_2));
 } // end operator<=()
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/placeholder.h b/thrust/detail/functional/placeholder.h
index d0832cfec..e3c083553 100644
--- a/thrust/detail/functional/placeholder.h
+++ b/thrust/detail/functional/placeholder.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/functional/actor.h>
 #include <thrust/detail/functional/argument.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -35,5 +34,5 @@ template<unsigned int i>
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/functional/value.h b/thrust/detail/functional/value.h
index 27a584676..d6b1563b1 100644
--- a/thrust/detail/functional/value.h
+++ b/thrust/detail/functional/value.h
@@ -28,8 +28,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/functional/actor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace functional
@@ -76,5 +75,5 @@ actor<value<T> > val(const T &x)
 
 } // end functional
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/gather.inl b/thrust/detail/gather.inl
index 4550742c5..3812702f6 100644
--- a/thrust/detail/gather.inl
+++ b/thrust/detail/gather.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file gather.inl
- *  \brief Inline file for gather.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/gather.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +24,7 @@
 #include <thrust/system/detail/generic/gather.h>
 #include <thrust/system/detail/adl/gather.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -96,9 +93,9 @@ template<typename InputIterator,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator>::type        System1; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System2; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System3; 
+  typedef typename thrust::iterator_system<InputIterator>::type        System1;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System2;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System3;
 
   System1 system1;
   System2 system2;
@@ -120,10 +117,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4;
 
   System1 system1;
   System2 system2;
@@ -148,10 +145,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
-  typedef typename thrust::iterator_system<OutputIterator>::type       System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
+  typedef typename thrust::iterator_system<OutputIterator>::type       System4;
 
   System1 system1;
   System2 system2;
@@ -161,6 +158,4 @@ template<typename InputIterator1,
   return thrust::gather_if(select_system(system1,system2,system3,system4), map_first, map_last, stencil, input_first, result, pred);
 } // end gather_if()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/generate.inl b/thrust/detail/generate.inl
index 2ce2ac936..2ecb65d58 100644
--- a/thrust/detail/generate.inl
+++ b/thrust/detail/generate.inl
@@ -14,11 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file generate.inl
- *  \author Jared Hoberock
- *  \brief Inline file for generate.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/generate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -26,9 +24,7 @@
 #include <thrust/system/detail/generic/generate.h>
 #include <thrust/system/detail/adl/generate.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -93,6 +89,4 @@ template<typename OutputIterator,
   return thrust::generate_n(select_system(system), first, n, gen);
 } // end generate_n()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/get_iterator_value.h b/thrust/detail/get_iterator_value.h
new file mode 100644
index 000000000..27e0a4e47
--- /dev/null
+++ b/thrust/detail/get_iterator_value.h
@@ -0,0 +1,55 @@
+#pragma once
+/*
+ *  Copyright 2008-2016 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/execution_policy.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/system/detail/generic/memory.h> // for get_value()
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail {
+
+// get_iterator_value specialization on iterators
+// --------------------------------------------------
+// it is okay to dereference iterator in the usual way
+template<typename DerivedPolicy, typename Iterator>
+__host__ __device__
+typename thrust::iterator_traits<Iterator>::value_type
+get_iterator_value(thrust::execution_policy<DerivedPolicy> &, Iterator it)
+{
+  return *it;
+} // get_iterator_value(exec,Iterator);
+
+// get_iterator_value specialization on pointer
+// ----------------------------------------------
+// we can't just dereference a pointer in the usual way, because
+// it may point to a location in the device memory. 
+// we use get_value(exec,pointer*) function
+// to perform a dereferencing consistent with the execution policy
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+typename thrust::detail::pointer_traits<Pointer*>::element_type 
+get_iterator_value(thrust::execution_policy<DerivedPolicy> &exec, Pointer* ptr)
+{
+  return get_value(derived_cast(exec),ptr);
+} // get_iterator_value(exec,Pointer*)
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/inner_product.inl b/thrust/detail/inner_product.inl
index 37247e68e..97cd2b0b5 100644
--- a/thrust/detail/inner_product.inl
+++ b/thrust/detail/inner_product.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file inner_product.inl
- *  \brief Inline file for inner_product.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/inner_product.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/inner_product.h>
 #include <thrust/system/detail/adl/inner_product.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -59,7 +54,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
                          InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
-                         OutputType init, 
+                         OutputType init,
                          BinaryFunction1 binary_op1,
                          BinaryFunction2 binary_op2)
 {
@@ -69,7 +64,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
 
 
 template<typename InputIterator1, typename InputIterator2, typename OutputType>
-OutputType 
+OutputType
 inner_product(InputIterator1 first1, InputIterator1 last1,
               InputIterator2 first2, OutputType init)
 {
@@ -89,7 +84,7 @@ template<typename InputIterator1, typename InputIterator2, typename OutputType,
          typename BinaryFunction1, typename BinaryFunction2>
 OutputType
 inner_product(InputIterator1 first1, InputIterator1 last1,
-              InputIterator2 first2, OutputType init, 
+              InputIterator2 first2, OutputType init,
               BinaryFunction1 binary_op1, BinaryFunction2 binary_op2)
 {
   using thrust::system::detail::generic::select_system;
@@ -103,6 +98,4 @@ inner_product(InputIterator1 first1, InputIterator1 last1,
   return thrust::inner_product(select_system(system1,system2), first1, last1, first2, init, binary_op1, binary_op2);
 } // end inner_product()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/integer_math.h b/thrust/detail/integer_math.h
index f9e8e5616..0f8c8aac1 100644
--- a/thrust/detail/integer_math.h
+++ b/thrust/detail/integer_math.h
@@ -17,45 +17,56 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <limits>
+#include <thrust/detail/type_deduction.h>
 
+#include <nv/target>
 
-namespace thrust
-{
+#include <limits>
+
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
-
-template<typename Integer>
+template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 Integer clz(Integer x)
 {
-  // XXX optimize by lowering to intrinsics
-  
-  int num_bits = 8 * sizeof(Integer);
-  int num_bits_minus_one = num_bits - 1;
-
-  for(int i = num_bits_minus_one; i >= 0; --i)
-  {
-    if((Integer(1) << i) & x)
+  Integer result;
+
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    result = ::__clz(x);
+  ), (
+    int num_bits = 8 * sizeof(Integer);
+    int num_bits_minus_one = num_bits - 1;
+    result = num_bits;
+    for (int i = num_bits_minus_one; i >= 0; --i)
     {
-      return num_bits_minus_one - i;
+      if ((Integer(1) << i) & x)
+      {
+        result = num_bits_minus_one - i;
+        break;
+      }
     }
-  }
+  ));
 
-  return num_bits;
+  return result;
 }
 
-
-template<typename Integer>
+template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 bool is_power_of_2(Integer x)
 {
   return 0 == (x & (x - 1));
 }
 
+template <typename Integer>
+__host__ __device__ __thrust_forceinline__
+bool is_odd(Integer x)
+{
+  return 1 & x;
+}
 
-template<typename Integer>
+template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 Integer log2(Integer x)
 {
@@ -66,30 +77,76 @@ Integer log2(Integer x)
 }
 
 
-template<typename Integer>
+template <typename Integer>
 __host__ __device__ __thrust_forceinline__
 Integer log2_ri(Integer x)
 {
   Integer result = log2(x);
 
-  // this is where we round up to the nearest log
-  if(!is_power_of_2(x))
-  {
+  // This is where we round up to the nearest log.
+  if (!is_power_of_2(x))
     ++result;
-  }
 
   return result;
 }
 
+// x/y rounding towards +infinity for integers
+// Used to determine # of blocks/warps etc.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+// FIXME: Should use common_type.
+auto divide_ri(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS((x + (y - 1)) / y)
+#else
+// FIXME: Should use common_type.
+Integer0 divide_ri(Integer0 const x, Integer1 const y)
+{
+  return (x + (y - 1)) / y;
+}
+#endif
 
-template<typename Integer>
+// x/y rounding towards zero for integers.
+// Used to determine # of blocks/warps etc.
+template <typename Integer0, typename Integer1>
 __host__ __device__ __thrust_forceinline__
-bool is_odd(Integer x)
+#if THRUST_CPP_DIALECT >= 2011
+auto divide_rz(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(x / y)
+#else
+// FIXME: Should use common_type.
+Integer0 divide_rz(Integer0 const x, Integer1 const y)
 {
-  return 1 & x;
+  return x / y;
+}
+#endif
+
+// Round x towards infinity to the next multiple of y.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+auto round_i(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(y * divide_ri(x, y))
+#else
+Integer0 round_i(Integer0 const x, Integer1 const y)
+{
+  return y * divide_ri(x, y);
 }
+#endif
 
+// Round x towards 0 to the next multiple of y.
+template <typename Integer0, typename Integer1>
+__host__ __device__ __thrust_forceinline__
+#if THRUST_CPP_DIALECT >= 2011
+auto round_z(Integer0 const x, Integer1 const y)
+THRUST_DECLTYPE_RETURNS(y * divide_rz(x, y))
+#else
+Integer0 round_z(Integer0 const x, Integer1 const y)
+{
+  return y * divide_rz(x, y);
+}
+#endif
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/integer_traits.h b/thrust/detail/integer_traits.h
index 97ab4f94d..853af20b8 100644
--- a/thrust/detail/integer_traits.h
+++ b/thrust/detail/integer_traits.h
@@ -20,8 +20,7 @@
 #include <limits>
 #include <limits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -30,16 +29,16 @@ template<typename T>
   class integer_traits
 {
   public:
-    static const bool is_integral = false;
+    static constexpr bool is_integral = false;
 };
 
 template<typename T, T min_val, T max_val>
   class integer_traits_base
 {
   public:
-    static const bool is_integral = true;
-    static const T const_min = min_val;
-    static const T const_max = max_val;
+    static constexpr bool is_integral = true;
+    static constexpr T const_min = min_val;
+    static constexpr T const_max = max_val;
 };
 
 
@@ -128,5 +127,4 @@ template<>
 
 } // end detail
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/internal_functional.h b/thrust/detail/internal_functional.h
index 0852c8e9b..a0c4056fe 100644
--- a/thrust/detail/internal_functional.h
+++ b/thrust/detail/internal_functional.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -23,13 +23,15 @@
 
 #include <thrust/tuple.h>
 #include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/tuple_of_iterator_references.h>
 #include <thrust/detail/raw_reference_cast.h>
-#include <memory> // for ::new
+#include <thrust/detail/memory_wrapper.h> // for ::new
+
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
 namespace detail
 {
 
@@ -38,12 +40,12 @@ template<typename Predicate>
 struct unary_negate
 {
   typedef bool result_type;
-  
+
   Predicate pred;
-  
+
   __host__ __device__
   explicit unary_negate(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T>
   __host__ __device__
   bool operator()(const T& x)
@@ -57,12 +59,12 @@ template<typename Predicate>
 struct binary_negate
 {
   typedef bool result_type;
-  
+
   Predicate pred;
-  
+
   __host__ __device__
   explicit binary_negate(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T1, typename T2>
   __host__ __device__
   bool operator()(const T1& x, const T2& y)
@@ -91,13 +93,13 @@ template<typename Predicate, typename IntegralType>
 struct predicate_to_integral
 {
   Predicate pred;
-  
+
   __host__ __device__
   explicit predicate_to_integral(const Predicate& pred) : pred(pred) {}
-  
+
   template <typename T>
   __host__ __device__
-  bool operator()(const T& x)
+  IntegralType operator()(const T& x)
   {
     return pred(x) ? IntegralType(1) : IntegralType(0);
   }
@@ -109,7 +111,7 @@ template<typename T1>
 struct equal_to
 {
   typedef bool result_type;
-  
+
   template <typename T2>
   __host__ __device__
   bool operator()(const T1& lhs, const T2& rhs) const
@@ -123,10 +125,10 @@ template<typename T2>
 struct equal_to_value
 {
   T2 rhs;
-  
+
   __host__ __device__
   equal_to_value(const T2& rhs) : rhs(rhs) {}
-  
+
   template <typename T1>
   __host__ __device__
   bool operator()(const T1& lhs) const
@@ -139,17 +141,17 @@ template<typename Predicate>
 struct tuple_binary_predicate
 {
   typedef bool result_type;
-  
+
   __host__ __device__
   tuple_binary_predicate(const Predicate& p) : pred(p) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   bool operator()(const Tuple& t) const
-  { 
+  {
     return pred(thrust::get<0>(t), thrust::get<1>(t));
   }
-  
+
   mutable Predicate pred;
 };
 
@@ -157,17 +159,17 @@ template<typename Predicate>
 struct tuple_not_binary_predicate
 {
   typedef bool result_type;
-  
+
   __host__ __device__
   tuple_not_binary_predicate(const Predicate& p) : pred(p) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   bool operator()(const Tuple& t) const
-  { 
+  {
     return !pred(thrust::get<0>(t), thrust::get<1>(t));
   }
-  
+
   mutable Predicate pred;
 };
 
@@ -176,6 +178,7 @@ template<typename Generator>
 {
   typedef void result_type;
 
+  __thrust_exec_check_disable__
   __host__ __device__
   host_generate_functor(Generator g)
     : gen(g) {}
@@ -209,6 +212,7 @@ template<typename Generator>
 {
   typedef void result_type;
 
+  __thrust_exec_check_disable__
   __host__ __device__
   device_generate_functor(Generator g)
     : gen(g) {}
@@ -271,19 +275,17 @@ template<typename T>
   struct is_non_const_reference
     : thrust::detail::and_<
         thrust::detail::not_<thrust::detail::is_const<T> >,
-        thrust::detail::is_reference<T>
+        thrust::detail::or_<thrust::detail::is_reference<T>,
+                            thrust::detail::is_proxy_reference<T> >
       >
 {};
 
 template<typename T> struct is_tuple_of_iterator_references : thrust::detail::false_type {};
 
-template<typename T1, typename T2, typename T3,
-         typename T4, typename T5, typename T6,
-         typename T7, typename T8, typename T9,
-         typename T10>
+template<typename... Ts>
   struct is_tuple_of_iterator_references<
     thrust::detail::tuple_of_iterator_references<
-      T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
+      Ts...
     >
   >
     : thrust::detail::true_type
@@ -407,7 +409,7 @@ struct binary_transform_if_functor
 
   __host__ __device__
   binary_transform_if_functor(BinaryFunction binary_op, Predicate pred)
-    : binary_op(binary_op), pred(pred) {} 
+    : binary_op(binary_op), pred(pred) {}
 
   __thrust_exec_check_disable__
   template<typename Tuple>
@@ -461,13 +463,24 @@ struct fill_functor
 {
   T exemplar;
 
+  __thrust_exec_check_disable__
   __host__ __device__
-  fill_functor(const T& _exemplar) 
+  fill_functor(const T& _exemplar)
     : exemplar(_exemplar) {}
 
+  __thrust_exec_check_disable__
+  __host__ __device__
+  fill_functor(const fill_functor & other)
+    :exemplar(other.exemplar){}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~fill_functor() {}
+
+  __thrust_exec_check_disable__
   __host__ __device__
   T operator()(void) const
-  { 
+  {
     return exemplar;
   }
 };
@@ -478,9 +491,20 @@ template<typename T>
 {
   T exemplar;
 
+  __thrust_exec_check_disable__
+  __host__ __device__
+  uninitialized_fill_functor(const T & x):exemplar(x){}
+
+  __thrust_exec_check_disable__
   __host__ __device__
-  uninitialized_fill_functor(T x):exemplar(x){}
+  uninitialized_fill_functor(const uninitialized_fill_functor & other)
+    :exemplar(other.exemplar){}
 
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~uninitialized_fill_functor() {}
+
+  __thrust_exec_check_disable__
   __host__ __device__
   void operator()(T &x)
   {
@@ -530,5 +554,5 @@ template<typename Compare>
 
 
 } // end namespace detail
-} // end namespace thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/logical.inl b/thrust/detail/logical.inl
index 2f428bc5f..3d39cac92 100644
--- a/thrust/detail/logical.inl
+++ b/thrust/detail/logical.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file logical.inl
- *  \brief Inline file for logical.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,9 +22,7 @@
 #include <thrust/system/detail/generic/logical.h>
 #include <thrust/system/detail/adl/logical.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
@@ -97,6 +92,4 @@ bool none_of(InputIterator first, InputIterator last, Predicate pred)
   return thrust::none_of(select_system(system), first, last, pred);
 }
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/malloc_and_free.h b/thrust/detail/malloc_and_free.h
index 00d9dff18..143518893 100644
--- a/thrust/detail/malloc_and_free.h
+++ b/thrust/detail/malloc_and_free.h
@@ -23,8 +23,7 @@
 #include <thrust/system/detail/generic/memory.h>
 #include <thrust/system/detail/adl/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy>
@@ -54,7 +53,7 @@ pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<Deri
 
 // XXX WAR nvbug 992955
 #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#if CUDA_VERSION < 5000
+#if CUDART_VERSION < 5000
 
 // cudafe generates unqualified calls to free(int *volatile)
 // which get confused with thrust::free
@@ -65,7 +64,7 @@ void free(int *volatile ptr)
   ::free(ptr);
 }
 
-#endif // CUDA_VERSION
+#endif // CUDART_VERSION
 #endif // THRUST_DEVICE_COMPILER
 
 __thrust_exec_check_disable__
@@ -81,5 +80,4 @@ void free(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Poin
 // XXX consider another form of free which does not take a system argument and
 // instead infers the system from the pointer
 
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/memory_algorithms.h b/thrust/detail/memory_algorithms.h
new file mode 100644
index 000000000..2f6b3a81d
--- /dev/null
+++ b/thrust/detail/memory_algorithms.h
@@ -0,0 +1,237 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+// TODO: These need to be turned into proper Thrust algorithms (dispatch layer,
+// backends, etc).
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/detail/memory_wrapper.h>
+#include <thrust/addressof.h>
+
+#include <nv/target>
+
+#include <utility>
+#include <new>
+
+
+THRUST_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+__host__ __device__
+void destroy_at(T* location)
+{
+  location->~T();
+}
+
+template <typename Allocator, typename T>
+__host__ __device__
+void destroy_at(Allocator const& alloc, T* location)
+{
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  traits::destroy(alloc_T, location);
+}
+
+template <typename ForwardIt>
+__host__ __device__
+ForwardIt destroy(ForwardIt first, ForwardIt last)
+{
+  for (; first != last; ++first)
+    destroy_at(addressof(*first));
+
+  return first;
+}
+
+template <typename Allocator, typename ForwardIt>
+__host__ __device__
+ForwardIt destroy(Allocator const& alloc, ForwardIt first, ForwardIt last)
+{
+  typedef typename iterator_traits<ForwardIt>::value_type T;
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  for (; first != last; ++first)
+    destroy_at(alloc_T, addressof(*first));
+
+  return first;
+}
+
+template <typename ForwardIt, typename Size>
+__host__ __device__
+ForwardIt destroy_n(ForwardIt first, Size n)
+{
+  for (; n > 0; (void) ++first, --n)
+    destroy_at(addressof(*first));
+
+  return first;
+}
+
+template <typename Allocator, typename ForwardIt, typename Size>
+__host__ __device__
+ForwardIt destroy_n(Allocator const& alloc, ForwardIt first, Size n)
+{
+  typedef typename iterator_traits<ForwardIt>::value_type T;
+  typedef typename detail::allocator_traits<
+    typename detail::remove_cv<
+      typename detail::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>::other traits;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  for (; n > 0; (void) ++first, --n)
+    destroy_at(alloc_T, addressof(*first));
+
+  return first;
+}
+
+template <typename ForwardIt, typename... Args>
+__host__ __device__
+void uninitialized_construct(
+  ForwardIt first, ForwardIt last, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  ForwardIt current = first;
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; current != last; ++current)
+      {
+        ::new (static_cast<void*>(addressof(*current))) T(args...);
+      }
+    } catch (...) {
+      destroy(first, current);
+      throw;
+    }
+  ), (
+    for (; current != last; ++current)
+    {
+      ::new (static_cast<void*>(addressof(*current))) T(args...);
+    }
+  ));
+}
+
+template <typename Allocator, typename ForwardIt, typename... Args>
+void uninitialized_construct_with_allocator(
+  Allocator const& alloc, ForwardIt first, ForwardIt last, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  ForwardIt current = first;
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; current != last; ++current)
+      {
+        traits::construct(alloc_T, addressof(*current), args...);
+      }
+    } catch (...) {
+      destroy(alloc_T, first, current);
+      throw;
+    }
+  ), (
+    for (; current != last; ++current)
+    {
+      traits::construct(alloc_T, addressof(*current), args...);
+    }
+  ));
+}
+
+template <typename ForwardIt, typename Size, typename... Args>
+void uninitialized_construct_n(
+  ForwardIt first, Size n, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  ForwardIt current = first;
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; n > 0; ++current, --n)
+      {
+        ::new (static_cast<void*>(addressof(*current))) T(args...);
+      }
+    } catch (...) {
+      destroy(first, current);
+      throw;
+    }
+  ), (
+    for (; n > 0; ++current, --n)
+    {
+      ::new (static_cast<void*>(addressof(*current))) T(args...);
+    }
+  ));
+}
+
+template <typename Allocator, typename ForwardIt, typename Size, typename... Args>
+void uninitialized_construct_n_with_allocator(
+  Allocator const& alloc, ForwardIt first, Size n, Args const&... args
+)
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+  using traits = typename detail::allocator_traits<
+    typename std::remove_cv<
+      typename std::remove_reference<Allocator>::type
+    >::type
+  >::template rebind_traits<T>;
+
+  typename traits::allocator_type alloc_T(alloc);
+
+  ForwardIt current = first;
+
+  // No exceptions in CUDA.
+  NV_IF_TARGET(NV_IS_HOST, (
+    try {
+      for (; n > 0; (void) ++current, --n)
+      {
+        traits::construct(alloc_T, addressof(*current), args...);
+      }
+    } catch (...) {
+      destroy(alloc_T, first, current);
+      throw;
+    }
+  ), (
+    for (; n > 0; (void) ++current, --n)
+    {
+      traits::construct(alloc_T, addressof(*current), args...);
+    }
+  ));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/memory_wrapper.h b/thrust/detail/memory_wrapper.h
new file mode 100644
index 000000000..bfc9056fa
--- /dev/null
+++ b/thrust/detail/memory_wrapper.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.  (<memory> declares several standard
+// algorithms, including all of the uninitialized_* algorithms.  "_ALGORITHMS_"
+// in the macro name is meant generically, not as a specific reference to
+// the header <algorithms>.)
+
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <memory>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/merge.inl b/thrust/detail/merge.inl
index d42475709..1595cc1a1 100644
--- a/thrust/detail/merge.inl
+++ b/thrust/detail/merge.inl
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-/*! \file merge.inl
- *  \brief Inline file for merge.h.
- */
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/merge.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -24,9 +24,7 @@
 #include <thrust/system/detail/generic/merge.h>
 #include <thrust/system/detail/adl/merge.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -220,6 +218,4 @@ template<typename InputIterator1,
   return thrust::merge_by_key(select_system(system1,system2,system3,system4,system5,system6), keys_first1, keys_last1, keys_first2, keys_last2, values_first1, values_first2, keys_result, values_result);
 } // end merge_by_key()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/minmax.h b/thrust/detail/minmax.h
index f59c64962..c565a74bd 100644
--- a/thrust/detail/minmax.h
+++ b/thrust/detail/minmax.h
@@ -18,9 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 template<typename T, typename BinaryPredicate>
 __host__ __device__
@@ -50,6 +48,4 @@ __host__ __device__
   return lhs < rhs ? rhs : lhs;
 } // end max()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/mismatch.inl b/thrust/detail/mismatch.inl
index 6c39aab86..16c579d80 100644
--- a/thrust/detail/mismatch.inl
+++ b/thrust/detail/mismatch.inl
@@ -14,11 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file mismatch.inl
- *  \brief Inline file for mismatch.h
- */
-
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/mismatch.h>
@@ -27,9 +23,7 @@
 #include <thrust/system/detail/generic/mismatch.h>
 #include <thrust/system/detail/adl/mismatch.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
@@ -92,6 +86,4 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
   return thrust::mismatch(select_system(system1,system2), first1, last1, first2, pred);
 } // end mismatch()
 
-
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp b/thrust/detail/modern_gcc_required.h
similarity index 61%
rename from thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp
rename to thrust/detail/modern_gcc_required.h
index d3014de70..a8c3d98ba 100644
--- a/thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp
+++ b/thrust/detail/modern_gcc_required.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -16,23 +16,11 @@
 
 #pragma once
 
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/iterator/detail/is_trivial_iterator.h>
+#include <thrust/detail/config/cpp_dialect.h>
 
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename T>
-  struct is_contiguous_iterator
-    : thrust::detail::is_trivial_iterator<T>
-{};
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
+#ifndef THRUST_MODERN_GCC_REQUIRED_NO_ERROR
+#  if defined(THRUST_GCC_VERSION) && !defined(THRUST_MODERN_GCC)
+#    error GCC 5 or later is required for this Thrust feature; please upgrade your compiler.
+#  endif
+#endif
 
diff --git a/thrust/detail/mpl/math.h b/thrust/detail/mpl/math.h
index 5356c9c15..bda98003c 100644
--- a/thrust/detail/mpl/math.h
+++ b/thrust/detail/mpl/math.h
@@ -22,8 +22,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -170,5 +171,5 @@ template<typename result_type, result_type x>
 
 } // end namespace detail
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/numeric_traits.h b/thrust/detail/numeric_traits.h
index 168b9ad0f..e728adcaf 100644
--- a/thrust/detail/numeric_traits.h
+++ b/thrust/detail/numeric_traits.h
@@ -16,13 +16,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 #include <limits>
 
 //#include <stdint.h> // for intmax_t (not provided on MSVS 2005)
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -126,5 +126,4 @@ numeric_distance(Number x, Number y)
 
 } // end detail
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/device_vector.inl b/thrust/detail/numeric_wrapper.h
similarity index 53%
rename from thrust/detail/device_vector.inl
rename to thrust/detail/numeric_wrapper.h
index e59b5670e..9ebc6e23b 100644
--- a/thrust/detail/device_vector.inl
+++ b/thrust/detail/numeric_wrapper.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,25 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file device_vector.inl
- *  \brief Inline file for device_vector.h.
- */
-
-#include <thrust/host_vector.h>
-
-namespace thrust
-{
-
-template<typename T, typename Alloc>
-  template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector<T,Alloc>
-      ::device_vector(const host_vector<OtherT,OtherAlloc> &v)
-        :Parent(v)
-{
-  ;
-} // end device_vector::device_vector()
-
-} // end namespace thrust
+// When a compiler uses Thrust as part of its implementation of Standard C++
+// algorithms, a cycle of included files may result when Thrust code tries to
+// use a standard algorithm.  Having a macro that is defined only when Thrust
+// is including an algorithms-related header gives the compiler a chance to
+// detect and break the cycle of includes.
 
+#define THRUST_INCLUDING_ALGORITHMS_HEADER
+#include <numeric>
+#undef  THRUST_INCLUDING_ALGORITHMS_HEADER
diff --git a/thrust/detail/overlapped_copy.h b/thrust/detail/overlapped_copy.h
index f6bb85a91..418497de8 100644
--- a/thrust/detail/overlapped_copy.h
+++ b/thrust/detail/overlapped_copy.h
@@ -23,8 +23,8 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
+
 namespace detail
 {
 
@@ -127,5 +127,5 @@ template<typename RandomAccessIterator1,
 } // end overlapped_copy()
 
 } // end detail
-} // end thrust
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/pair.inl b/thrust/detail/pair.inl
index e5f15994e..4b7dd6eb0 100644
--- a/thrust/detail/pair.inl
+++ b/thrust/detail/pair.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,13 +14,18 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/pair.h>
 #include <thrust/detail/swap.h>
+#include <thrust/tuple.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <typename T1, typename T2>
+  __host__ __device__
   pair<T1,T2>
     ::pair(void)
       :first(),second()
@@ -30,6 +35,7 @@ template <typename T1, typename T2>
 
 
 template <typename T1, typename T2>
+  __host__ __device__
   pair<T1,T2>
     ::pair(const T1 &x, const T2 &y)
       :first(x),second(y)
@@ -40,6 +46,7 @@ template <typename T1, typename T2>
 
 template <typename T1, typename T2>
   template <typename U1, typename U2>
+    __host__ __device__
     pair<T1,T2>
       ::pair(const pair<U1,U2> &p)
         :first(p.first),second(p.second)
@@ -50,6 +57,7 @@ template <typename T1, typename T2>
 
 template <typename T1, typename T2>
   template <typename U1, typename U2>
+    __host__ __device__
     pair<T1,T2>
       ::pair(const std::pair<U1,U2> &p)
         :first(p.first),second(p.second)
@@ -136,13 +144,13 @@ template <typename T1, typename T2>
 
 // specializations of tuple_element for pair
 template<typename T1, typename T2>
-  struct tuple_element<0, pair<T1,T2> >
+  struct tuple_element<0, pair<T1,T2>>
 {
   typedef T1 type;
 }; // end tuple_element
 
 template<typename T1, typename T2>
-  struct tuple_element<1, pair<T1,T2> >
+  struct tuple_element<1, pair<T1,T2>>
 {
   typedef T2 type;
 }; // end tuple_element
@@ -150,7 +158,7 @@ template<typename T1, typename T2>
 
 // specialization of tuple_size for pair
 template<typename T1, typename T2>
-  struct tuple_size< pair<T1,T2 > >
+  struct tuple_size<pair<T1,T2>>
 {
   static const unsigned int value = 2;
 }; // end tuple_size
@@ -220,6 +228,4 @@ template<unsigned int N, typename T1, typename T2>
   return detail::pair_get<N, pair<T1,T2> >()(p);
 } // end get()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/partition.inl b/thrust/detail/partition.inl
index a667264c6..5c51bca80 100644
--- a/thrust/detail/partition.inl
+++ b/thrust/detail/partition.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file partition.inl
- *  \brief Inline file for partition.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/partition.h>
@@ -26,9 +23,7 @@
 #include <thrust/system/detail/generic/partition.h>
 #include <thrust/system/detail/adl/partition.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
@@ -413,6 +408,4 @@ template<typename InputIterator, typename Predicate>
   return thrust::is_partitioned(select_system(system), first, last, pred);
 } // end is_partitioned()
 
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/pointer.h b/thrust/detail/pointer.h
index 2e89d73a3..aed1fcc24 100644
--- a/thrust/detail/pointer.h
+++ b/thrust/detail/pointer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,53 +14,60 @@
  *  limitations under the License.
  */
 
+/*! \file
+ *  \brief A pointer to a variable which resides in memory associated with a
+ *  system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/detail/iterator_traversal_tags.h>
+#include <thrust/type_traits/remove_cvref.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/reference_forward_declaration.h>
 #include <ostream>
+#include <cstddef>
 
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
+template <typename Element, typename Tag, typename Reference = use_default, typename Derived = use_default>
+class pointer;
 
-// declare pointer with default values of template parameters
-template<typename Element, typename Tag, typename Reference = use_default, typename Derived = use_default> class pointer;
-
-} // end thrust
+// Specialize `thrust::iterator_traits` to avoid problems with the name of
+// pointer's constructor shadowing its nested pointer type. We do this before
+// pointer is defined so the specialization is correctly used inside the
+// definition.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+struct iterator_traits<thrust::pointer<Element, Tag, Reference, Derived>>
+{
+  using pointer           = thrust::pointer<Element, Tag, Reference, Derived>;
+  using iterator_category = typename pointer::iterator_category;
+  using value_type        = typename pointer::value_type;
+  using difference_type   = typename pointer::difference_type;
+  using reference         = typename pointer::reference;
+};
 
+THRUST_NAMESPACE_END
 
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
+namespace std
 {
 
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct iterator_traits<thrust::pointer<Element,Tag,Reference,Derived> >
+template <typename Element, typename Tag, typename Reference, typename Derived>
+struct iterator_traits<THRUST_NS_QUALIFIER::pointer<Element, Tag, Reference, Derived>>
 {
-  private:
-    typedef thrust::pointer<Element,Tag,Reference,Derived> ptr;
-
-  public:
-    typedef typename ptr::iterator_category iterator_category;
-    typedef typename ptr::value_type        value_type;
-    typedef typename ptr::difference_type   difference_type;
-    // XXX implement this type (the result of operator->) later
-    typedef void                             pointer;
-    typedef typename ptr::reference         reference;
-}; // end iterator_traits
+  using pointer           = THRUST_NS_QUALIFIER::pointer<Element, Tag, Reference, Derived>;
+  using iterator_category = typename pointer::iterator_category;
+  using value_type        = typename pointer::value_type;
+  using difference_type   = typename pointer::difference_type;
+  using reference         = typename pointer::reference;
+};
 
-} // end thrust
+} // namespace std
 
-
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -72,7 +79,7 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   // void pointers should have no element type
   // note that we remove_cv from the Element type to get the value_type
   typedef typename thrust::detail::eval_if<
-    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::is_void<typename thrust::remove_cvref<Element>::type>::value,
     thrust::detail::identity_<void>,
     thrust::detail::remove_cv<Element>
   >::type value_type;
@@ -87,14 +94,14 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   // void pointers should have no reference type
   // if no Reference type is given, just use reference
   typedef typename thrust::detail::eval_if<
-    thrust::detail::is_void<typename thrust::detail::remove_const<Element>::type>::value,
+    thrust::detail::is_void<typename thrust::remove_cvref<Element>::type>::value,
     thrust::detail::identity_<void>,
     thrust::detail::eval_if<
       thrust::detail::is_same<Reference,use_default>::value,
       thrust::detail::identity_<reference<Element,derived_type> >,
       thrust::detail::identity_<Reference>
     >
-  >::type reference_arg;
+  >::type reference_type;
 
   typedef thrust::iterator_adaptor<
     derived_type,                        // pass along the type of our Derived class to iterator_adaptor
@@ -102,7 +109,7 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
     value_type,                          // the value type
     Tag,                                 // system tag
     thrust::random_access_traversal_tag, // pointers have random access traversal
-    reference_arg,                       // pass along our Reference type
+    reference_type,                      // pass along our Reference type
     std::ptrdiff_t
   > type;
 }; // end pointer_base
@@ -116,7 +123,8 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 // 1. no-argument constructor
 // 2. constructor from OtherElement *
 // 3. constructor from OtherPointer related by convertibility
-// 4. assignment from OtherPointer related by convertibility
+// 4. constructor from OtherPointer to void
+// 5. assignment from OtherPointer related by convertibility
 // These should just call the corresponding members of pointer.
 template<typename Element, typename Tag, typename Reference, typename Derived>
   class pointer
@@ -141,10 +149,15 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
     typedef typename super_t::base_type raw_pointer;
 
     // constructors
-    
+
     __host__ __device__
     pointer();
 
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    pointer(std::nullptr_t);
+
     // OtherValue shall be convertible to Value
     // XXX consider making the pointer implementation a template parameter which defaults to Element *
     template<typename OtherElement>
@@ -161,8 +174,24 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
               pointer<Element,Tag,Reference,Derived>
             >::type * = 0);
 
+    // OtherPointer's element_type shall be void
+    // OtherPointer's system shall be convertible to Tag
+    template<typename OtherPointer>
+    __host__ __device__
+    explicit
+    pointer(const OtherPointer &other,
+            typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+              OtherPointer,
+              pointer<Element,Tag,Reference,Derived>
+            >::type * = 0);
+
     // assignment
-    
+
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    derived_type& operator=(std::nullptr_t);
+
     // OtherPointer's element_type shall be convertible to Element
     // OtherPointer's system shall be convertible to Tag
     template<typename OtherPointer>
@@ -178,16 +207,49 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 
     __host__ __device__
     Element *get() const;
+
+    __host__ __device__
+    Element *operator->() const;
+
+    // NOTE: This is needed so that Thrust smart pointers can be used in
+    // `std::unique_ptr`.
+    __host__ __device__
+    explicit operator bool() const;
+
+    __host__ __device__
+    static derived_type pointer_to(typename thrust::detail::pointer_traits_detail::pointer_to_param<Element>::type r)
+    {
+      return thrust::detail::pointer_traits<derived_type>::pointer_to(r);
+    }
 }; // end pointer
 
 // Output stream operator
 template<typename Element, typename Tag, typename Reference, typename Derived,
          typename charT, typename traits>
+__host__
 std::basic_ostream<charT, traits> &
 operator<<(std::basic_ostream<charT, traits> &os,
            const pointer<Element, Tag, Reference, Derived> &p);
 
-} // end thrust
+// NOTE: This is needed so that Thrust smart pointers can be used in
+// `std::unique_ptr`.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p);
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t);
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p);
+
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t);
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/pointer.inl>
 
diff --git a/thrust/detail/pointer.inl b/thrust/detail/pointer.inl
index 332ebebb5..de05ff20f 100644
--- a/thrust/detail/pointer.inl
+++ b/thrust/detail/pointer.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,23 +14,34 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
+
 #include <thrust/detail/pointer.h>
+#include <thrust/detail/type_traits.h>
 
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  pointer<Element,Tag,Reference,Derived>
+    ::pointer()
+      : super_t(static_cast<Element*>(nullptr))
+{} // end pointer::pointer
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
   pointer<Element,Tag,Reference,Derived>
-    ::pointer()
-      : super_t(static_cast<Element*>(0))
+    ::pointer(std::nullptr_t)
+      : super_t(static_cast<Element*>(nullptr))
 {} // end pointer::pointer
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
   template<typename OtherElement>
+    __host__ __device__
     pointer<Element,Tag,Reference,Derived>
       ::pointer(OtherElement *other)
         : super_t(other)
@@ -39,6 +50,7 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
   template<typename OtherPointer>
+    __host__ __device__
     pointer<Element,Tag,Reference,Derived>
       ::pointer(const OtherPointer &other,
                 typename thrust::detail::enable_if_pointer_is_convertible<
@@ -51,6 +63,31 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
   template<typename OtherPointer>
+    __host__ __device__
+    pointer<Element,Tag,Reference,Derived>
+      ::pointer(const OtherPointer &other,
+                typename thrust::detail::enable_if_void_pointer_is_system_convertible<
+                  OtherPointer,
+                  pointer<Element,Tag,Reference,Derived>
+                 >::type *)
+        : super_t(static_cast<Element *>(thrust::detail::pointer_traits<OtherPointer>::get(other)))
+{} // end pointer::pointer
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  typename pointer<Element,Tag,Reference,Derived>::derived_type &
+    pointer<Element,Tag,Reference,Derived>
+      ::operator=(decltype(nullptr))
+{
+  super_t::base_reference() = nullptr;
+  return static_cast<derived_type&>(*this);
+} // end pointer::operator=
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  template<typename OtherPointer>
+    __host__ __device__
     typename thrust::detail::enable_if_pointer_is_convertible<
       OtherPointer,
       pointer<Element,Tag,Reference,Derived>,
@@ -63,88 +100,110 @@ template<typename Element, typename Tag, typename Reference, typename Derived>
   return static_cast<derived_type&>(*this);
 } // end pointer::operator=
 
+namespace detail
+{
+
+// Implementation for dereference() when Reference is Element&,
+// e.g. cuda's managed_memory_pointer
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::true_type /* is_cpp_ref */)
+{
+  return *ptr.get();
+}
+
+// Implementation for pointers with proxy references:
+template <typename Reference, typename Derived>
+__host__ __device__
+Reference pointer_dereference_impl(const Derived& ptr,
+                                   thrust::detail::false_type /* is_cpp_ref */)
+{
+  return Reference(ptr);
+}
+
+} // namespace detail
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
   typename pointer<Element,Tag,Reference,Derived>::super_t::reference
-    pointer<Element,Tag,Reference,Derived>
-      ::dereference() const
+  pointer<Element,Tag,Reference,Derived>
+    ::dereference() const
 {
-  return typename super_t::reference(static_cast<const derived_type&>(*this));
+  // Need to handle cpp refs and fancy refs differently:
+  typedef typename super_t::reference RefT;
+  typedef typename thrust::detail::is_reference<RefT>::type IsCppRef;
+
+  const derived_type& derivedPtr = static_cast<const derived_type&>(*this);
+
+  return detail::pointer_dereference_impl<RefT>(derivedPtr, IsCppRef());
 } // end pointer::dereference
 
 
 template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
   Element *pointer<Element,Tag,Reference,Derived>
     ::get() const
 {
   return super_t::base();
 } // end pointer::get
 
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  Element *pointer<Element,Tag,Reference,Derived>
+    ::operator->() const
+{
+  return super_t::base();
+} // end pointer::operator->
+
+
+template<typename Element, typename Tag, typename Reference, typename Derived>
+  __host__ __device__
+  pointer<Element,Tag,Reference,Derived>
+    ::operator bool() const
+{
+  return bool(get());
+} // end pointer::operator bool
+
+
 template<typename Element, typename Tag, typename Reference, typename Derived,
          typename charT, typename traits>
+__host__
 std::basic_ostream<charT, traits> &
 operator<<(std::basic_ostream<charT, traits> &os,
            const pointer<Element, Tag, Reference, Derived> &p) {
   return os << p.get();
 }
 
-namespace detail
+// NOTE: These are needed so that Thrust smart pointers work with
+// `std::unique_ptr`.
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p)
 {
+  return nullptr == p.get();
+}
 
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-// XXX WAR MSVC 2005 problem with correctly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct pointer_raw_pointer< thrust::pointer<Element,Tag,Reference,Derived> >
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator==(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t)
 {
-  typedef typename pointer<Element,Tag,Reference,Derived>::raw_pointer type;
-}; // end pointer_raw_pointer
-#endif
-
+  return nullptr == p.get();
+}
 
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40200)
-// XXX WAR g++-4.1 problem with correctly implementing
-//     pointer_element for pointer by specializing it here
-template<typename Element, typename Tag>
-  struct pointer_element< thrust::pointer<Element,Tag> >
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(std::nullptr_t, pointer<Element, Tag, Reference, Derived> p)
 {
-  typedef Element type;
-}; // end pointer_element
-
-template<typename Element, typename Tag, typename Reference>
-  struct pointer_element< thrust::pointer<Element,Tag,Reference> >
-    : pointer_element< thrust::pointer<Element,Tag> >
-{}; // end pointer_element
-
-template<typename Element, typename Tag, typename Reference, typename Derived>
-  struct pointer_element< thrust::pointer<Element,Tag,Reference,Derived> >
-    : pointer_element< thrust::pointer<Element,Tag,Reference> >
-{}; // end pointer_element
-
-
+  return !(nullptr == p);
+}
 
-// XXX WAR g++-4.1 problem with correctly implementing
-//     rebind_pointer for pointer by specializing it here
-template<typename Element, typename Tag, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
+template <typename Element, typename Tag, typename Reference, typename Derived>
+__host__ __device__
+bool operator!=(pointer<Element, Tag, Reference, Derived> p, std::nullptr_t)
 {
-  // XXX note we don't attempt to rebind the pointer's Reference type (or Derived)
-  typedef thrust::pointer<NewElement,Tag> type;
-};
-
-template<typename Element, typename Tag, typename Reference, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
-    : rebind_pointer<thrust::pointer<Element,Tag>, NewElement>
-{};
-
-template<typename Element, typename Tag, typename Reference, typename Derived, typename NewElement>
-  struct rebind_pointer<thrust::pointer<Element,Tag,Reference,Derived>, NewElement>
-    : rebind_pointer<thrust::pointer<Element,Tag,Reference>, NewElement>
-{};
-#endif
-
-} // end namespace detail
-
-
-} // end thrust
+  return !(nullptr == p);
+}
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/preprocessor.h b/thrust/detail/preprocessor.h
new file mode 100644
index 000000000..2e850c764
--- /dev/null
+++ b/thrust/detail/preprocessor.h
@@ -0,0 +1,1182 @@
+// Copyright (c) 2017-2018 NVIDIA Corporation
+// Copyright (c) 2014-2018 Bryce Adelstein Lelbach
+// Copyright (c) 2001-2015 Housemarque Oy (housemarque.com)
+// Copyright (c) 2007-2015 Hartmut Kaiser
+// Copyright (c)      2002 Peter Dimov and Multi Media Ltd
+//                         (`THRUST_CURRENT_FUNCTION`)
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_STRINGIZE(expr)
+/// \brief Stringizes the expression \a expr.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_STRINGIZE(foo) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << "foo" << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_STRINGIZE(expr) THRUST_PP_STRINGIZE_IMPL0(expr)
+#define THRUST_PP_STRINGIZE_IMPL0(expr) #expr
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_CAT2(a, b)
+/// \brief Concatenates the tokens \a a and \b b.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_CAT2(1, THRUST_PP_CAT2(2, 3)) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 123 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_CAT2(a, b) THRUST_PP_CAT2_IMPL0(a, b)
+
+#if    defined(_MSC_VER)                                                      \
+  && (defined(__EDG__) || defined(__EDG_VERSION__))                         \
+  && (defined(__INTELLISENSE__) || __EDG_VERSION__ >= 308)
+  #define THRUST_PP_CAT2_IMPL0(a, b) THRUST_PP_CAT2_IMPL1(~, a ## b)
+  #define THRUST_PP_CAT2_IMPL1(p, res) res
+#else
+  #define THRUST_PP_CAT2_IMPL0(a, b) a ## b
+#endif
+
+#define THRUST_PP_CAT3(a, b, c)                                               \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b, c))                                                     \
+  /**/
+
+#define THRUST_PP_CAT4(a, b, c, d)                                            \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b,                                                         \
+      THRUST_PP_CAT2(c, d)))                                                  \
+  /**/
+
+#define THRUST_PP_CAT5(a, b, c, d, e)                                         \
+  THRUST_PP_CAT2(a,                                                           \
+    THRUST_PP_CAT2(b,                                                         \
+      THRUST_PP_CAT2(c,                                                       \
+        THRUST_PP_CAT2(d, e))))                                               \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_EXPAND(x)
+/// \brief Performs macro expansion on \a x.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// #define FOO_BAR() "foo_bar"
+/// #define BUZZ()     THRUST_PP_EXPAND(THRUST_PP_CAT2(FOO_, BAR)())
+///
+/// int main()
+/// {
+///   std::cout << BUZZ() << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << "foo_bar" << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_EXPAND(x) THRUST_PP_EXPAND_IMPL0(x)
+#define THRUST_PP_EXPAND_IMPL0(x) x
+
+#define THRUST_PP_EXPAND_ARGS(...) THRUST_PP_EXPAND_ARGS_IMPL0(__VA_ARGS__)
+#define THRUST_PP_EXPAND_ARGS_IMPL0(...) __VA_ARGS__
+
+#define THRUST_PP_HEAD(x, ...) x
+
+#define THRUST_PP_TAIL(x, ...) __VA_ARGS__
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_EMPTY()
+
+#define THRUST_PP_COMMA() ,
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_INC(x) THRUST_PP_INC_IMPL0(x)
+
+#define THRUST_PP_INC_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_INC_IMPL_TAG, x)
+
+#define THRUST_PP_INC_IMPL_TAG0 1
+#define THRUST_PP_INC_IMPL_TAG1 2
+#define THRUST_PP_INC_IMPL_TAG2 3
+#define THRUST_PP_INC_IMPL_TAG3 4
+#define THRUST_PP_INC_IMPL_TAG4 5
+#define THRUST_PP_INC_IMPL_TAG5 6
+#define THRUST_PP_INC_IMPL_TAG6 7
+#define THRUST_PP_INC_IMPL_TAG7 8
+#define THRUST_PP_INC_IMPL_TAG8 9
+#define THRUST_PP_INC_IMPL_TAG9 10
+#define THRUST_PP_INC_IMPL_TAG10 11
+#define THRUST_PP_INC_IMPL_TAG11 12
+#define THRUST_PP_INC_IMPL_TAG12 13
+#define THRUST_PP_INC_IMPL_TAG13 14
+#define THRUST_PP_INC_IMPL_TAG14 15
+#define THRUST_PP_INC_IMPL_TAG15 16
+#define THRUST_PP_INC_IMPL_TAG16 17
+#define THRUST_PP_INC_IMPL_TAG17 18
+#define THRUST_PP_INC_IMPL_TAG18 19
+#define THRUST_PP_INC_IMPL_TAG19 20
+#define THRUST_PP_INC_IMPL_TAG20 21
+#define THRUST_PP_INC_IMPL_TAG21 22
+#define THRUST_PP_INC_IMPL_TAG22 23
+#define THRUST_PP_INC_IMPL_TAG23 24
+#define THRUST_PP_INC_IMPL_TAG24 25
+#define THRUST_PP_INC_IMPL_TAG25 26
+#define THRUST_PP_INC_IMPL_TAG26 27
+#define THRUST_PP_INC_IMPL_TAG27 28
+#define THRUST_PP_INC_IMPL_TAG28 29
+#define THRUST_PP_INC_IMPL_TAG29 30
+#define THRUST_PP_INC_IMPL_TAG30 31
+#define THRUST_PP_INC_IMPL_TAG31 32
+#define THRUST_PP_INC_IMPL_TAG32 33
+#define THRUST_PP_INC_IMPL_TAG33 34
+#define THRUST_PP_INC_IMPL_TAG34 35
+#define THRUST_PP_INC_IMPL_TAG35 36
+#define THRUST_PP_INC_IMPL_TAG36 37
+#define THRUST_PP_INC_IMPL_TAG37 38
+#define THRUST_PP_INC_IMPL_TAG38 39
+#define THRUST_PP_INC_IMPL_TAG39 40
+#define THRUST_PP_INC_IMPL_TAG40 41
+#define THRUST_PP_INC_IMPL_TAG41 42
+#define THRUST_PP_INC_IMPL_TAG42 43
+#define THRUST_PP_INC_IMPL_TAG43 44
+#define THRUST_PP_INC_IMPL_TAG44 45
+#define THRUST_PP_INC_IMPL_TAG45 46
+#define THRUST_PP_INC_IMPL_TAG46 47
+#define THRUST_PP_INC_IMPL_TAG47 48
+#define THRUST_PP_INC_IMPL_TAG48 49
+#define THRUST_PP_INC_IMPL_TAG49 50
+#define THRUST_PP_INC_IMPL_TAG50 51
+#define THRUST_PP_INC_IMPL_TAG51 52
+#define THRUST_PP_INC_IMPL_TAG52 53
+#define THRUST_PP_INC_IMPL_TAG53 54
+#define THRUST_PP_INC_IMPL_TAG54 55
+#define THRUST_PP_INC_IMPL_TAG55 56
+#define THRUST_PP_INC_IMPL_TAG56 57
+#define THRUST_PP_INC_IMPL_TAG57 58
+#define THRUST_PP_INC_IMPL_TAG58 59
+#define THRUST_PP_INC_IMPL_TAG59 60
+#define THRUST_PP_INC_IMPL_TAG60 61
+#define THRUST_PP_INC_IMPL_TAG61 62
+#define THRUST_PP_INC_IMPL_TAG62 63
+#define THRUST_PP_INC_IMPL_TAG63 64
+#define THRUST_PP_INC_IMPL_TAG64 65
+#define THRUST_PP_INC_IMPL_TAG65 66
+#define THRUST_PP_INC_IMPL_TAG66 67
+#define THRUST_PP_INC_IMPL_TAG67 68
+#define THRUST_PP_INC_IMPL_TAG68 69
+#define THRUST_PP_INC_IMPL_TAG69 70
+#define THRUST_PP_INC_IMPL_TAG70 71
+#define THRUST_PP_INC_IMPL_TAG71 72
+#define THRUST_PP_INC_IMPL_TAG72 73
+#define THRUST_PP_INC_IMPL_TAG73 74
+#define THRUST_PP_INC_IMPL_TAG74 75
+#define THRUST_PP_INC_IMPL_TAG75 76
+#define THRUST_PP_INC_IMPL_TAG76 77
+#define THRUST_PP_INC_IMPL_TAG77 78
+#define THRUST_PP_INC_IMPL_TAG78 79
+#define THRUST_PP_INC_IMPL_TAG79 80
+#define THRUST_PP_INC_IMPL_TAG80 81
+#define THRUST_PP_INC_IMPL_TAG81 82
+#define THRUST_PP_INC_IMPL_TAG82 83
+#define THRUST_PP_INC_IMPL_TAG83 84
+#define THRUST_PP_INC_IMPL_TAG84 85
+#define THRUST_PP_INC_IMPL_TAG85 86
+#define THRUST_PP_INC_IMPL_TAG86 87
+#define THRUST_PP_INC_IMPL_TAG87 88
+#define THRUST_PP_INC_IMPL_TAG88 89
+#define THRUST_PP_INC_IMPL_TAG89 90
+#define THRUST_PP_INC_IMPL_TAG90 91
+#define THRUST_PP_INC_IMPL_TAG91 92
+#define THRUST_PP_INC_IMPL_TAG92 93
+#define THRUST_PP_INC_IMPL_TAG93 94
+#define THRUST_PP_INC_IMPL_TAG94 95
+#define THRUST_PP_INC_IMPL_TAG95 96
+#define THRUST_PP_INC_IMPL_TAG96 97
+#define THRUST_PP_INC_IMPL_TAG97 98
+#define THRUST_PP_INC_IMPL_TAG98 99
+#define THRUST_PP_INC_IMPL_TAG99 100
+#define THRUST_PP_INC_IMPL_TAG100 101
+#define THRUST_PP_INC_IMPL_TAG101 102
+#define THRUST_PP_INC_IMPL_TAG102 103
+#define THRUST_PP_INC_IMPL_TAG103 104
+#define THRUST_PP_INC_IMPL_TAG104 105
+#define THRUST_PP_INC_IMPL_TAG105 106
+#define THRUST_PP_INC_IMPL_TAG106 107
+#define THRUST_PP_INC_IMPL_TAG107 108
+#define THRUST_PP_INC_IMPL_TAG108 109
+#define THRUST_PP_INC_IMPL_TAG109 110
+#define THRUST_PP_INC_IMPL_TAG110 111
+#define THRUST_PP_INC_IMPL_TAG111 112
+#define THRUST_PP_INC_IMPL_TAG112 113
+#define THRUST_PP_INC_IMPL_TAG113 114
+#define THRUST_PP_INC_IMPL_TAG114 115
+#define THRUST_PP_INC_IMPL_TAG115 116
+#define THRUST_PP_INC_IMPL_TAG116 117
+#define THRUST_PP_INC_IMPL_TAG117 118
+#define THRUST_PP_INC_IMPL_TAG118 119
+#define THRUST_PP_INC_IMPL_TAG119 120
+#define THRUST_PP_INC_IMPL_TAG120 121
+#define THRUST_PP_INC_IMPL_TAG121 122
+#define THRUST_PP_INC_IMPL_TAG122 123
+#define THRUST_PP_INC_IMPL_TAG123 124
+#define THRUST_PP_INC_IMPL_TAG124 125
+#define THRUST_PP_INC_IMPL_TAG125 126
+#define THRUST_PP_INC_IMPL_TAG126 127
+#define THRUST_PP_INC_IMPL_TAG127 128
+#define THRUST_PP_INC_IMPL_TAG128 129
+#define THRUST_PP_INC_IMPL_TAG129 130
+#define THRUST_PP_INC_IMPL_TAG130 131
+#define THRUST_PP_INC_IMPL_TAG131 132
+#define THRUST_PP_INC_IMPL_TAG132 133
+#define THRUST_PP_INC_IMPL_TAG133 134
+#define THRUST_PP_INC_IMPL_TAG134 135
+#define THRUST_PP_INC_IMPL_TAG135 136
+#define THRUST_PP_INC_IMPL_TAG136 137
+#define THRUST_PP_INC_IMPL_TAG137 138
+#define THRUST_PP_INC_IMPL_TAG138 139
+#define THRUST_PP_INC_IMPL_TAG139 140
+#define THRUST_PP_INC_IMPL_TAG140 141
+#define THRUST_PP_INC_IMPL_TAG141 142
+#define THRUST_PP_INC_IMPL_TAG142 143
+#define THRUST_PP_INC_IMPL_TAG143 144
+#define THRUST_PP_INC_IMPL_TAG144 145
+#define THRUST_PP_INC_IMPL_TAG145 146
+#define THRUST_PP_INC_IMPL_TAG146 147
+#define THRUST_PP_INC_IMPL_TAG147 148
+#define THRUST_PP_INC_IMPL_TAG148 149
+#define THRUST_PP_INC_IMPL_TAG149 150
+#define THRUST_PP_INC_IMPL_TAG150 151
+#define THRUST_PP_INC_IMPL_TAG151 152
+#define THRUST_PP_INC_IMPL_TAG152 153
+#define THRUST_PP_INC_IMPL_TAG153 154
+#define THRUST_PP_INC_IMPL_TAG154 155
+#define THRUST_PP_INC_IMPL_TAG155 156
+#define THRUST_PP_INC_IMPL_TAG156 157
+#define THRUST_PP_INC_IMPL_TAG157 158
+#define THRUST_PP_INC_IMPL_TAG158 159
+#define THRUST_PP_INC_IMPL_TAG159 160
+#define THRUST_PP_INC_IMPL_TAG160 161
+#define THRUST_PP_INC_IMPL_TAG161 162
+#define THRUST_PP_INC_IMPL_TAG162 163
+#define THRUST_PP_INC_IMPL_TAG163 164
+#define THRUST_PP_INC_IMPL_TAG164 165
+#define THRUST_PP_INC_IMPL_TAG165 166
+#define THRUST_PP_INC_IMPL_TAG166 167
+#define THRUST_PP_INC_IMPL_TAG167 168
+#define THRUST_PP_INC_IMPL_TAG168 169
+#define THRUST_PP_INC_IMPL_TAG169 170
+#define THRUST_PP_INC_IMPL_TAG170 171
+#define THRUST_PP_INC_IMPL_TAG171 172
+#define THRUST_PP_INC_IMPL_TAG172 173
+#define THRUST_PP_INC_IMPL_TAG173 174
+#define THRUST_PP_INC_IMPL_TAG174 175
+#define THRUST_PP_INC_IMPL_TAG175 176
+#define THRUST_PP_INC_IMPL_TAG176 177
+#define THRUST_PP_INC_IMPL_TAG177 178
+#define THRUST_PP_INC_IMPL_TAG178 179
+#define THRUST_PP_INC_IMPL_TAG179 180
+#define THRUST_PP_INC_IMPL_TAG180 181
+#define THRUST_PP_INC_IMPL_TAG181 182
+#define THRUST_PP_INC_IMPL_TAG182 183
+#define THRUST_PP_INC_IMPL_TAG183 184
+#define THRUST_PP_INC_IMPL_TAG184 185
+#define THRUST_PP_INC_IMPL_TAG185 186
+#define THRUST_PP_INC_IMPL_TAG186 187
+#define THRUST_PP_INC_IMPL_TAG187 188
+#define THRUST_PP_INC_IMPL_TAG188 189
+#define THRUST_PP_INC_IMPL_TAG189 190
+#define THRUST_PP_INC_IMPL_TAG190 191
+#define THRUST_PP_INC_IMPL_TAG191 192
+#define THRUST_PP_INC_IMPL_TAG192 193
+#define THRUST_PP_INC_IMPL_TAG193 194
+#define THRUST_PP_INC_IMPL_TAG194 195
+#define THRUST_PP_INC_IMPL_TAG195 196
+#define THRUST_PP_INC_IMPL_TAG196 197
+#define THRUST_PP_INC_IMPL_TAG197 198
+#define THRUST_PP_INC_IMPL_TAG198 199
+#define THRUST_PP_INC_IMPL_TAG199 200
+#define THRUST_PP_INC_IMPL_TAG200 201
+#define THRUST_PP_INC_IMPL_TAG201 202
+#define THRUST_PP_INC_IMPL_TAG202 203
+#define THRUST_PP_INC_IMPL_TAG203 204
+#define THRUST_PP_INC_IMPL_TAG204 205
+#define THRUST_PP_INC_IMPL_TAG205 206
+#define THRUST_PP_INC_IMPL_TAG206 207
+#define THRUST_PP_INC_IMPL_TAG207 208
+#define THRUST_PP_INC_IMPL_TAG208 209
+#define THRUST_PP_INC_IMPL_TAG209 210
+#define THRUST_PP_INC_IMPL_TAG210 211
+#define THRUST_PP_INC_IMPL_TAG211 212
+#define THRUST_PP_INC_IMPL_TAG212 213
+#define THRUST_PP_INC_IMPL_TAG213 214
+#define THRUST_PP_INC_IMPL_TAG214 215
+#define THRUST_PP_INC_IMPL_TAG215 216
+#define THRUST_PP_INC_IMPL_TAG216 217
+#define THRUST_PP_INC_IMPL_TAG217 218
+#define THRUST_PP_INC_IMPL_TAG218 219
+#define THRUST_PP_INC_IMPL_TAG219 220
+#define THRUST_PP_INC_IMPL_TAG220 221
+#define THRUST_PP_INC_IMPL_TAG221 222
+#define THRUST_PP_INC_IMPL_TAG222 223
+#define THRUST_PP_INC_IMPL_TAG223 224
+#define THRUST_PP_INC_IMPL_TAG224 225
+#define THRUST_PP_INC_IMPL_TAG225 226
+#define THRUST_PP_INC_IMPL_TAG226 227
+#define THRUST_PP_INC_IMPL_TAG227 228
+#define THRUST_PP_INC_IMPL_TAG228 229
+#define THRUST_PP_INC_IMPL_TAG229 230
+#define THRUST_PP_INC_IMPL_TAG230 231
+#define THRUST_PP_INC_IMPL_TAG231 232
+#define THRUST_PP_INC_IMPL_TAG232 233
+#define THRUST_PP_INC_IMPL_TAG233 234
+#define THRUST_PP_INC_IMPL_TAG234 235
+#define THRUST_PP_INC_IMPL_TAG235 236
+#define THRUST_PP_INC_IMPL_TAG236 237
+#define THRUST_PP_INC_IMPL_TAG237 238
+#define THRUST_PP_INC_IMPL_TAG238 239
+#define THRUST_PP_INC_IMPL_TAG239 240
+#define THRUST_PP_INC_IMPL_TAG240 241
+#define THRUST_PP_INC_IMPL_TAG241 242
+#define THRUST_PP_INC_IMPL_TAG242 243
+#define THRUST_PP_INC_IMPL_TAG243 244
+#define THRUST_PP_INC_IMPL_TAG244 245
+#define THRUST_PP_INC_IMPL_TAG245 246
+#define THRUST_PP_INC_IMPL_TAG246 247
+#define THRUST_PP_INC_IMPL_TAG247 248
+#define THRUST_PP_INC_IMPL_TAG248 249
+#define THRUST_PP_INC_IMPL_TAG249 250
+#define THRUST_PP_INC_IMPL_TAG250 251
+#define THRUST_PP_INC_IMPL_TAG251 252
+#define THRUST_PP_INC_IMPL_TAG252 253
+#define THRUST_PP_INC_IMPL_TAG253 254
+#define THRUST_PP_INC_IMPL_TAG254 255
+#define THRUST_PP_INC_IMPL_TAG255 256
+#define THRUST_PP_INC_IMPL_TAG256 256
+
+#define THRUST_PP_DEC(x) THRUST_PP_DEC_IMPL0(x)
+
+#define THRUST_PP_DEC_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_DEC_IMPL_TAG, x)
+
+#define THRUST_PP_DEC_IMPL_TAG0 0
+#define THRUST_PP_DEC_IMPL_TAG1 0
+#define THRUST_PP_DEC_IMPL_TAG2 1
+#define THRUST_PP_DEC_IMPL_TAG3 2
+#define THRUST_PP_DEC_IMPL_TAG4 3
+#define THRUST_PP_DEC_IMPL_TAG5 4
+#define THRUST_PP_DEC_IMPL_TAG6 5
+#define THRUST_PP_DEC_IMPL_TAG7 6
+#define THRUST_PP_DEC_IMPL_TAG8 7
+#define THRUST_PP_DEC_IMPL_TAG9 8
+#define THRUST_PP_DEC_IMPL_TAG10 9
+#define THRUST_PP_DEC_IMPL_TAG11 10
+#define THRUST_PP_DEC_IMPL_TAG12 11
+#define THRUST_PP_DEC_IMPL_TAG13 12
+#define THRUST_PP_DEC_IMPL_TAG14 13
+#define THRUST_PP_DEC_IMPL_TAG15 14
+#define THRUST_PP_DEC_IMPL_TAG16 15
+#define THRUST_PP_DEC_IMPL_TAG17 16
+#define THRUST_PP_DEC_IMPL_TAG18 17
+#define THRUST_PP_DEC_IMPL_TAG19 18
+#define THRUST_PP_DEC_IMPL_TAG20 19
+#define THRUST_PP_DEC_IMPL_TAG21 20
+#define THRUST_PP_DEC_IMPL_TAG22 21
+#define THRUST_PP_DEC_IMPL_TAG23 22
+#define THRUST_PP_DEC_IMPL_TAG24 23
+#define THRUST_PP_DEC_IMPL_TAG25 24
+#define THRUST_PP_DEC_IMPL_TAG26 25
+#define THRUST_PP_DEC_IMPL_TAG27 26
+#define THRUST_PP_DEC_IMPL_TAG28 27
+#define THRUST_PP_DEC_IMPL_TAG29 28
+#define THRUST_PP_DEC_IMPL_TAG30 29
+#define THRUST_PP_DEC_IMPL_TAG31 30
+#define THRUST_PP_DEC_IMPL_TAG32 31
+#define THRUST_PP_DEC_IMPL_TAG33 32
+#define THRUST_PP_DEC_IMPL_TAG34 33
+#define THRUST_PP_DEC_IMPL_TAG35 34
+#define THRUST_PP_DEC_IMPL_TAG36 35
+#define THRUST_PP_DEC_IMPL_TAG37 36
+#define THRUST_PP_DEC_IMPL_TAG38 37
+#define THRUST_PP_DEC_IMPL_TAG39 38
+#define THRUST_PP_DEC_IMPL_TAG40 39
+#define THRUST_PP_DEC_IMPL_TAG41 40
+#define THRUST_PP_DEC_IMPL_TAG42 41
+#define THRUST_PP_DEC_IMPL_TAG43 42
+#define THRUST_PP_DEC_IMPL_TAG44 43
+#define THRUST_PP_DEC_IMPL_TAG45 44
+#define THRUST_PP_DEC_IMPL_TAG46 45
+#define THRUST_PP_DEC_IMPL_TAG47 46
+#define THRUST_PP_DEC_IMPL_TAG48 47
+#define THRUST_PP_DEC_IMPL_TAG49 48
+#define THRUST_PP_DEC_IMPL_TAG50 49
+#define THRUST_PP_DEC_IMPL_TAG51 50
+#define THRUST_PP_DEC_IMPL_TAG52 51
+#define THRUST_PP_DEC_IMPL_TAG53 52
+#define THRUST_PP_DEC_IMPL_TAG54 53
+#define THRUST_PP_DEC_IMPL_TAG55 54
+#define THRUST_PP_DEC_IMPL_TAG56 55
+#define THRUST_PP_DEC_IMPL_TAG57 56
+#define THRUST_PP_DEC_IMPL_TAG58 57
+#define THRUST_PP_DEC_IMPL_TAG59 58
+#define THRUST_PP_DEC_IMPL_TAG60 59
+#define THRUST_PP_DEC_IMPL_TAG61 60
+#define THRUST_PP_DEC_IMPL_TAG62 61
+#define THRUST_PP_DEC_IMPL_TAG63 62
+#define THRUST_PP_DEC_IMPL_TAG64 63
+#define THRUST_PP_DEC_IMPL_TAG65 64
+#define THRUST_PP_DEC_IMPL_TAG66 65
+#define THRUST_PP_DEC_IMPL_TAG67 66
+#define THRUST_PP_DEC_IMPL_TAG68 67
+#define THRUST_PP_DEC_IMPL_TAG69 68
+#define THRUST_PP_DEC_IMPL_TAG70 69
+#define THRUST_PP_DEC_IMPL_TAG71 70
+#define THRUST_PP_DEC_IMPL_TAG72 71
+#define THRUST_PP_DEC_IMPL_TAG73 72
+#define THRUST_PP_DEC_IMPL_TAG74 73
+#define THRUST_PP_DEC_IMPL_TAG75 74
+#define THRUST_PP_DEC_IMPL_TAG76 75
+#define THRUST_PP_DEC_IMPL_TAG77 76
+#define THRUST_PP_DEC_IMPL_TAG78 77
+#define THRUST_PP_DEC_IMPL_TAG79 78
+#define THRUST_PP_DEC_IMPL_TAG80 79
+#define THRUST_PP_DEC_IMPL_TAG81 80
+#define THRUST_PP_DEC_IMPL_TAG82 81
+#define THRUST_PP_DEC_IMPL_TAG83 82
+#define THRUST_PP_DEC_IMPL_TAG84 83
+#define THRUST_PP_DEC_IMPL_TAG85 84
+#define THRUST_PP_DEC_IMPL_TAG86 85
+#define THRUST_PP_DEC_IMPL_TAG87 86
+#define THRUST_PP_DEC_IMPL_TAG88 87
+#define THRUST_PP_DEC_IMPL_TAG89 88
+#define THRUST_PP_DEC_IMPL_TAG90 89
+#define THRUST_PP_DEC_IMPL_TAG91 90
+#define THRUST_PP_DEC_IMPL_TAG92 91
+#define THRUST_PP_DEC_IMPL_TAG93 92
+#define THRUST_PP_DEC_IMPL_TAG94 93
+#define THRUST_PP_DEC_IMPL_TAG95 94
+#define THRUST_PP_DEC_IMPL_TAG96 95
+#define THRUST_PP_DEC_IMPL_TAG97 96
+#define THRUST_PP_DEC_IMPL_TAG98 97
+#define THRUST_PP_DEC_IMPL_TAG99 98
+#define THRUST_PP_DEC_IMPL_TAG100 99
+#define THRUST_PP_DEC_IMPL_TAG101 100
+#define THRUST_PP_DEC_IMPL_TAG102 101
+#define THRUST_PP_DEC_IMPL_TAG103 102
+#define THRUST_PP_DEC_IMPL_TAG104 103
+#define THRUST_PP_DEC_IMPL_TAG105 104
+#define THRUST_PP_DEC_IMPL_TAG106 105
+#define THRUST_PP_DEC_IMPL_TAG107 106
+#define THRUST_PP_DEC_IMPL_TAG108 107
+#define THRUST_PP_DEC_IMPL_TAG109 108
+#define THRUST_PP_DEC_IMPL_TAG110 109
+#define THRUST_PP_DEC_IMPL_TAG111 110
+#define THRUST_PP_DEC_IMPL_TAG112 111
+#define THRUST_PP_DEC_IMPL_TAG113 112
+#define THRUST_PP_DEC_IMPL_TAG114 113
+#define THRUST_PP_DEC_IMPL_TAG115 114
+#define THRUST_PP_DEC_IMPL_TAG116 115
+#define THRUST_PP_DEC_IMPL_TAG117 116
+#define THRUST_PP_DEC_IMPL_TAG118 117
+#define THRUST_PP_DEC_IMPL_TAG119 118
+#define THRUST_PP_DEC_IMPL_TAG120 119
+#define THRUST_PP_DEC_IMPL_TAG121 120
+#define THRUST_PP_DEC_IMPL_TAG122 121
+#define THRUST_PP_DEC_IMPL_TAG123 122
+#define THRUST_PP_DEC_IMPL_TAG124 123
+#define THRUST_PP_DEC_IMPL_TAG125 124
+#define THRUST_PP_DEC_IMPL_TAG126 125
+#define THRUST_PP_DEC_IMPL_TAG127 126
+#define THRUST_PP_DEC_IMPL_TAG128 127
+#define THRUST_PP_DEC_IMPL_TAG129 128
+#define THRUST_PP_DEC_IMPL_TAG130 129
+#define THRUST_PP_DEC_IMPL_TAG131 130
+#define THRUST_PP_DEC_IMPL_TAG132 131
+#define THRUST_PP_DEC_IMPL_TAG133 132
+#define THRUST_PP_DEC_IMPL_TAG134 133
+#define THRUST_PP_DEC_IMPL_TAG135 134
+#define THRUST_PP_DEC_IMPL_TAG136 135
+#define THRUST_PP_DEC_IMPL_TAG137 136
+#define THRUST_PP_DEC_IMPL_TAG138 137
+#define THRUST_PP_DEC_IMPL_TAG139 138
+#define THRUST_PP_DEC_IMPL_TAG140 139
+#define THRUST_PP_DEC_IMPL_TAG141 140
+#define THRUST_PP_DEC_IMPL_TAG142 141
+#define THRUST_PP_DEC_IMPL_TAG143 142
+#define THRUST_PP_DEC_IMPL_TAG144 143
+#define THRUST_PP_DEC_IMPL_TAG145 144
+#define THRUST_PP_DEC_IMPL_TAG146 145
+#define THRUST_PP_DEC_IMPL_TAG147 146
+#define THRUST_PP_DEC_IMPL_TAG148 147
+#define THRUST_PP_DEC_IMPL_TAG149 148
+#define THRUST_PP_DEC_IMPL_TAG150 149
+#define THRUST_PP_DEC_IMPL_TAG151 150
+#define THRUST_PP_DEC_IMPL_TAG152 151
+#define THRUST_PP_DEC_IMPL_TAG153 152
+#define THRUST_PP_DEC_IMPL_TAG154 153
+#define THRUST_PP_DEC_IMPL_TAG155 154
+#define THRUST_PP_DEC_IMPL_TAG156 155
+#define THRUST_PP_DEC_IMPL_TAG157 156
+#define THRUST_PP_DEC_IMPL_TAG158 157
+#define THRUST_PP_DEC_IMPL_TAG159 158
+#define THRUST_PP_DEC_IMPL_TAG160 159
+#define THRUST_PP_DEC_IMPL_TAG161 160
+#define THRUST_PP_DEC_IMPL_TAG162 161
+#define THRUST_PP_DEC_IMPL_TAG163 162
+#define THRUST_PP_DEC_IMPL_TAG164 163
+#define THRUST_PP_DEC_IMPL_TAG165 164
+#define THRUST_PP_DEC_IMPL_TAG166 165
+#define THRUST_PP_DEC_IMPL_TAG167 166
+#define THRUST_PP_DEC_IMPL_TAG168 167
+#define THRUST_PP_DEC_IMPL_TAG169 168
+#define THRUST_PP_DEC_IMPL_TAG170 169
+#define THRUST_PP_DEC_IMPL_TAG171 170
+#define THRUST_PP_DEC_IMPL_TAG172 171
+#define THRUST_PP_DEC_IMPL_TAG173 172
+#define THRUST_PP_DEC_IMPL_TAG174 173
+#define THRUST_PP_DEC_IMPL_TAG175 174
+#define THRUST_PP_DEC_IMPL_TAG176 175
+#define THRUST_PP_DEC_IMPL_TAG177 176
+#define THRUST_PP_DEC_IMPL_TAG178 177
+#define THRUST_PP_DEC_IMPL_TAG179 178
+#define THRUST_PP_DEC_IMPL_TAG180 179
+#define THRUST_PP_DEC_IMPL_TAG181 180
+#define THRUST_PP_DEC_IMPL_TAG182 181
+#define THRUST_PP_DEC_IMPL_TAG183 182
+#define THRUST_PP_DEC_IMPL_TAG184 183
+#define THRUST_PP_DEC_IMPL_TAG185 184
+#define THRUST_PP_DEC_IMPL_TAG186 185
+#define THRUST_PP_DEC_IMPL_TAG187 186
+#define THRUST_PP_DEC_IMPL_TAG188 187
+#define THRUST_PP_DEC_IMPL_TAG189 188
+#define THRUST_PP_DEC_IMPL_TAG190 189
+#define THRUST_PP_DEC_IMPL_TAG191 190
+#define THRUST_PP_DEC_IMPL_TAG192 191
+#define THRUST_PP_DEC_IMPL_TAG193 192
+#define THRUST_PP_DEC_IMPL_TAG194 193
+#define THRUST_PP_DEC_IMPL_TAG195 194
+#define THRUST_PP_DEC_IMPL_TAG196 195
+#define THRUST_PP_DEC_IMPL_TAG197 196
+#define THRUST_PP_DEC_IMPL_TAG198 197
+#define THRUST_PP_DEC_IMPL_TAG199 198
+#define THRUST_PP_DEC_IMPL_TAG200 199
+#define THRUST_PP_DEC_IMPL_TAG201 200
+#define THRUST_PP_DEC_IMPL_TAG202 201
+#define THRUST_PP_DEC_IMPL_TAG203 202
+#define THRUST_PP_DEC_IMPL_TAG204 203
+#define THRUST_PP_DEC_IMPL_TAG205 204
+#define THRUST_PP_DEC_IMPL_TAG206 205
+#define THRUST_PP_DEC_IMPL_TAG207 206
+#define THRUST_PP_DEC_IMPL_TAG208 207
+#define THRUST_PP_DEC_IMPL_TAG209 208
+#define THRUST_PP_DEC_IMPL_TAG210 209
+#define THRUST_PP_DEC_IMPL_TAG211 210
+#define THRUST_PP_DEC_IMPL_TAG212 211
+#define THRUST_PP_DEC_IMPL_TAG213 212
+#define THRUST_PP_DEC_IMPL_TAG214 213
+#define THRUST_PP_DEC_IMPL_TAG215 214
+#define THRUST_PP_DEC_IMPL_TAG216 215
+#define THRUST_PP_DEC_IMPL_TAG217 216
+#define THRUST_PP_DEC_IMPL_TAG218 217
+#define THRUST_PP_DEC_IMPL_TAG219 218
+#define THRUST_PP_DEC_IMPL_TAG220 219
+#define THRUST_PP_DEC_IMPL_TAG221 220
+#define THRUST_PP_DEC_IMPL_TAG222 221
+#define THRUST_PP_DEC_IMPL_TAG223 222
+#define THRUST_PP_DEC_IMPL_TAG224 223
+#define THRUST_PP_DEC_IMPL_TAG225 224
+#define THRUST_PP_DEC_IMPL_TAG226 225
+#define THRUST_PP_DEC_IMPL_TAG227 226
+#define THRUST_PP_DEC_IMPL_TAG228 227
+#define THRUST_PP_DEC_IMPL_TAG229 228
+#define THRUST_PP_DEC_IMPL_TAG230 229
+#define THRUST_PP_DEC_IMPL_TAG231 230
+#define THRUST_PP_DEC_IMPL_TAG232 231
+#define THRUST_PP_DEC_IMPL_TAG233 232
+#define THRUST_PP_DEC_IMPL_TAG234 233
+#define THRUST_PP_DEC_IMPL_TAG235 234
+#define THRUST_PP_DEC_IMPL_TAG236 235
+#define THRUST_PP_DEC_IMPL_TAG237 236
+#define THRUST_PP_DEC_IMPL_TAG238 237
+#define THRUST_PP_DEC_IMPL_TAG239 238
+#define THRUST_PP_DEC_IMPL_TAG240 239
+#define THRUST_PP_DEC_IMPL_TAG241 240
+#define THRUST_PP_DEC_IMPL_TAG242 241
+#define THRUST_PP_DEC_IMPL_TAG243 242
+#define THRUST_PP_DEC_IMPL_TAG244 243
+#define THRUST_PP_DEC_IMPL_TAG245 244
+#define THRUST_PP_DEC_IMPL_TAG246 245
+#define THRUST_PP_DEC_IMPL_TAG247 246
+#define THRUST_PP_DEC_IMPL_TAG248 247
+#define THRUST_PP_DEC_IMPL_TAG249 248
+#define THRUST_PP_DEC_IMPL_TAG250 249
+#define THRUST_PP_DEC_IMPL_TAG251 250
+#define THRUST_PP_DEC_IMPL_TAG252 251
+#define THRUST_PP_DEC_IMPL_TAG253 252
+#define THRUST_PP_DEC_IMPL_TAG254 253
+#define THRUST_PP_DEC_IMPL_TAG255 254
+#define THRUST_PP_DEC_IMPL_TAG256 255
+#define THRUST_PP_DEC_IMPL_TAG257 256
+
+#define THRUST_PP_BOOL(x) THRUST_PP_BOOL_IMPL0(x)
+
+#define THRUST_PP_BOOL_IMPL0(x) THRUST_PP_CAT2(THRUST_PP_BOOL_IMPL_TAG, x)
+
+#define THRUST_PP_BOOL_IMPL_TAG0 0
+#define THRUST_PP_BOOL_IMPL_TAG1 1
+#define THRUST_PP_BOOL_IMPL_TAG2 1
+#define THRUST_PP_BOOL_IMPL_TAG3 1
+#define THRUST_PP_BOOL_IMPL_TAG4 1
+#define THRUST_PP_BOOL_IMPL_TAG5 1
+#define THRUST_PP_BOOL_IMPL_TAG6 1
+#define THRUST_PP_BOOL_IMPL_TAG7 1
+#define THRUST_PP_BOOL_IMPL_TAG8 1
+#define THRUST_PP_BOOL_IMPL_TAG9 1
+#define THRUST_PP_BOOL_IMPL_TAG10 1
+#define THRUST_PP_BOOL_IMPL_TAG11 1
+#define THRUST_PP_BOOL_IMPL_TAG12 1
+#define THRUST_PP_BOOL_IMPL_TAG13 1
+#define THRUST_PP_BOOL_IMPL_TAG14 1
+#define THRUST_PP_BOOL_IMPL_TAG15 1
+#define THRUST_PP_BOOL_IMPL_TAG16 1
+#define THRUST_PP_BOOL_IMPL_TAG17 1
+#define THRUST_PP_BOOL_IMPL_TAG18 1
+#define THRUST_PP_BOOL_IMPL_TAG19 1
+#define THRUST_PP_BOOL_IMPL_TAG20 1
+#define THRUST_PP_BOOL_IMPL_TAG21 1
+#define THRUST_PP_BOOL_IMPL_TAG22 1
+#define THRUST_PP_BOOL_IMPL_TAG23 1
+#define THRUST_PP_BOOL_IMPL_TAG24 1
+#define THRUST_PP_BOOL_IMPL_TAG25 1
+#define THRUST_PP_BOOL_IMPL_TAG26 1
+#define THRUST_PP_BOOL_IMPL_TAG27 1
+#define THRUST_PP_BOOL_IMPL_TAG28 1
+#define THRUST_PP_BOOL_IMPL_TAG29 1
+#define THRUST_PP_BOOL_IMPL_TAG30 1
+#define THRUST_PP_BOOL_IMPL_TAG31 1
+#define THRUST_PP_BOOL_IMPL_TAG32 1
+#define THRUST_PP_BOOL_IMPL_TAG33 1
+#define THRUST_PP_BOOL_IMPL_TAG34 1
+#define THRUST_PP_BOOL_IMPL_TAG35 1
+#define THRUST_PP_BOOL_IMPL_TAG36 1
+#define THRUST_PP_BOOL_IMPL_TAG37 1
+#define THRUST_PP_BOOL_IMPL_TAG38 1
+#define THRUST_PP_BOOL_IMPL_TAG39 1
+#define THRUST_PP_BOOL_IMPL_TAG40 1
+#define THRUST_PP_BOOL_IMPL_TAG41 1
+#define THRUST_PP_BOOL_IMPL_TAG42 1
+#define THRUST_PP_BOOL_IMPL_TAG43 1
+#define THRUST_PP_BOOL_IMPL_TAG44 1
+#define THRUST_PP_BOOL_IMPL_TAG45 1
+#define THRUST_PP_BOOL_IMPL_TAG46 1
+#define THRUST_PP_BOOL_IMPL_TAG47 1
+#define THRUST_PP_BOOL_IMPL_TAG48 1
+#define THRUST_PP_BOOL_IMPL_TAG49 1
+#define THRUST_PP_BOOL_IMPL_TAG50 1
+#define THRUST_PP_BOOL_IMPL_TAG51 1
+#define THRUST_PP_BOOL_IMPL_TAG52 1
+#define THRUST_PP_BOOL_IMPL_TAG53 1
+#define THRUST_PP_BOOL_IMPL_TAG54 1
+#define THRUST_PP_BOOL_IMPL_TAG55 1
+#define THRUST_PP_BOOL_IMPL_TAG56 1
+#define THRUST_PP_BOOL_IMPL_TAG57 1
+#define THRUST_PP_BOOL_IMPL_TAG58 1
+#define THRUST_PP_BOOL_IMPL_TAG59 1
+#define THRUST_PP_BOOL_IMPL_TAG60 1
+#define THRUST_PP_BOOL_IMPL_TAG61 1
+#define THRUST_PP_BOOL_IMPL_TAG62 1
+#define THRUST_PP_BOOL_IMPL_TAG63 1
+#define THRUST_PP_BOOL_IMPL_TAG64 1
+#define THRUST_PP_BOOL_IMPL_TAG65 1
+#define THRUST_PP_BOOL_IMPL_TAG66 1
+#define THRUST_PP_BOOL_IMPL_TAG67 1
+#define THRUST_PP_BOOL_IMPL_TAG68 1
+#define THRUST_PP_BOOL_IMPL_TAG69 1
+#define THRUST_PP_BOOL_IMPL_TAG70 1
+#define THRUST_PP_BOOL_IMPL_TAG71 1
+#define THRUST_PP_BOOL_IMPL_TAG72 1
+#define THRUST_PP_BOOL_IMPL_TAG73 1
+#define THRUST_PP_BOOL_IMPL_TAG74 1
+#define THRUST_PP_BOOL_IMPL_TAG75 1
+#define THRUST_PP_BOOL_IMPL_TAG76 1
+#define THRUST_PP_BOOL_IMPL_TAG77 1
+#define THRUST_PP_BOOL_IMPL_TAG78 1
+#define THRUST_PP_BOOL_IMPL_TAG79 1
+#define THRUST_PP_BOOL_IMPL_TAG80 1
+#define THRUST_PP_BOOL_IMPL_TAG81 1
+#define THRUST_PP_BOOL_IMPL_TAG82 1
+#define THRUST_PP_BOOL_IMPL_TAG83 1
+#define THRUST_PP_BOOL_IMPL_TAG84 1
+#define THRUST_PP_BOOL_IMPL_TAG85 1
+#define THRUST_PP_BOOL_IMPL_TAG86 1
+#define THRUST_PP_BOOL_IMPL_TAG87 1
+#define THRUST_PP_BOOL_IMPL_TAG88 1
+#define THRUST_PP_BOOL_IMPL_TAG89 1
+#define THRUST_PP_BOOL_IMPL_TAG90 1
+#define THRUST_PP_BOOL_IMPL_TAG91 1
+#define THRUST_PP_BOOL_IMPL_TAG92 1
+#define THRUST_PP_BOOL_IMPL_TAG93 1
+#define THRUST_PP_BOOL_IMPL_TAG94 1
+#define THRUST_PP_BOOL_IMPL_TAG95 1
+#define THRUST_PP_BOOL_IMPL_TAG96 1
+#define THRUST_PP_BOOL_IMPL_TAG97 1
+#define THRUST_PP_BOOL_IMPL_TAG98 1
+#define THRUST_PP_BOOL_IMPL_TAG99 1
+#define THRUST_PP_BOOL_IMPL_TAG100 1
+#define THRUST_PP_BOOL_IMPL_TAG101 1
+#define THRUST_PP_BOOL_IMPL_TAG102 1
+#define THRUST_PP_BOOL_IMPL_TAG103 1
+#define THRUST_PP_BOOL_IMPL_TAG104 1
+#define THRUST_PP_BOOL_IMPL_TAG105 1
+#define THRUST_PP_BOOL_IMPL_TAG106 1
+#define THRUST_PP_BOOL_IMPL_TAG107 1
+#define THRUST_PP_BOOL_IMPL_TAG108 1
+#define THRUST_PP_BOOL_IMPL_TAG109 1
+#define THRUST_PP_BOOL_IMPL_TAG110 1
+#define THRUST_PP_BOOL_IMPL_TAG111 1
+#define THRUST_PP_BOOL_IMPL_TAG112 1
+#define THRUST_PP_BOOL_IMPL_TAG113 1
+#define THRUST_PP_BOOL_IMPL_TAG114 1
+#define THRUST_PP_BOOL_IMPL_TAG115 1
+#define THRUST_PP_BOOL_IMPL_TAG116 1
+#define THRUST_PP_BOOL_IMPL_TAG117 1
+#define THRUST_PP_BOOL_IMPL_TAG118 1
+#define THRUST_PP_BOOL_IMPL_TAG119 1
+#define THRUST_PP_BOOL_IMPL_TAG120 1
+#define THRUST_PP_BOOL_IMPL_TAG121 1
+#define THRUST_PP_BOOL_IMPL_TAG122 1
+#define THRUST_PP_BOOL_IMPL_TAG123 1
+#define THRUST_PP_BOOL_IMPL_TAG124 1
+#define THRUST_PP_BOOL_IMPL_TAG125 1
+#define THRUST_PP_BOOL_IMPL_TAG126 1
+#define THRUST_PP_BOOL_IMPL_TAG127 1
+#define THRUST_PP_BOOL_IMPL_TAG128 1
+#define THRUST_PP_BOOL_IMPL_TAG129 1
+#define THRUST_PP_BOOL_IMPL_TAG130 1
+#define THRUST_PP_BOOL_IMPL_TAG131 1
+#define THRUST_PP_BOOL_IMPL_TAG132 1
+#define THRUST_PP_BOOL_IMPL_TAG133 1
+#define THRUST_PP_BOOL_IMPL_TAG134 1
+#define THRUST_PP_BOOL_IMPL_TAG135 1
+#define THRUST_PP_BOOL_IMPL_TAG136 1
+#define THRUST_PP_BOOL_IMPL_TAG137 1
+#define THRUST_PP_BOOL_IMPL_TAG138 1
+#define THRUST_PP_BOOL_IMPL_TAG139 1
+#define THRUST_PP_BOOL_IMPL_TAG140 1
+#define THRUST_PP_BOOL_IMPL_TAG141 1
+#define THRUST_PP_BOOL_IMPL_TAG142 1
+#define THRUST_PP_BOOL_IMPL_TAG143 1
+#define THRUST_PP_BOOL_IMPL_TAG144 1
+#define THRUST_PP_BOOL_IMPL_TAG145 1
+#define THRUST_PP_BOOL_IMPL_TAG146 1
+#define THRUST_PP_BOOL_IMPL_TAG147 1
+#define THRUST_PP_BOOL_IMPL_TAG148 1
+#define THRUST_PP_BOOL_IMPL_TAG149 1
+#define THRUST_PP_BOOL_IMPL_TAG150 1
+#define THRUST_PP_BOOL_IMPL_TAG151 1
+#define THRUST_PP_BOOL_IMPL_TAG152 1
+#define THRUST_PP_BOOL_IMPL_TAG153 1
+#define THRUST_PP_BOOL_IMPL_TAG154 1
+#define THRUST_PP_BOOL_IMPL_TAG155 1
+#define THRUST_PP_BOOL_IMPL_TAG156 1
+#define THRUST_PP_BOOL_IMPL_TAG157 1
+#define THRUST_PP_BOOL_IMPL_TAG158 1
+#define THRUST_PP_BOOL_IMPL_TAG159 1
+#define THRUST_PP_BOOL_IMPL_TAG160 1
+#define THRUST_PP_BOOL_IMPL_TAG161 1
+#define THRUST_PP_BOOL_IMPL_TAG162 1
+#define THRUST_PP_BOOL_IMPL_TAG163 1
+#define THRUST_PP_BOOL_IMPL_TAG164 1
+#define THRUST_PP_BOOL_IMPL_TAG165 1
+#define THRUST_PP_BOOL_IMPL_TAG166 1
+#define THRUST_PP_BOOL_IMPL_TAG167 1
+#define THRUST_PP_BOOL_IMPL_TAG168 1
+#define THRUST_PP_BOOL_IMPL_TAG169 1
+#define THRUST_PP_BOOL_IMPL_TAG170 1
+#define THRUST_PP_BOOL_IMPL_TAG171 1
+#define THRUST_PP_BOOL_IMPL_TAG172 1
+#define THRUST_PP_BOOL_IMPL_TAG173 1
+#define THRUST_PP_BOOL_IMPL_TAG174 1
+#define THRUST_PP_BOOL_IMPL_TAG175 1
+#define THRUST_PP_BOOL_IMPL_TAG176 1
+#define THRUST_PP_BOOL_IMPL_TAG177 1
+#define THRUST_PP_BOOL_IMPL_TAG178 1
+#define THRUST_PP_BOOL_IMPL_TAG179 1
+#define THRUST_PP_BOOL_IMPL_TAG180 1
+#define THRUST_PP_BOOL_IMPL_TAG181 1
+#define THRUST_PP_BOOL_IMPL_TAG182 1
+#define THRUST_PP_BOOL_IMPL_TAG183 1
+#define THRUST_PP_BOOL_IMPL_TAG184 1
+#define THRUST_PP_BOOL_IMPL_TAG185 1
+#define THRUST_PP_BOOL_IMPL_TAG186 1
+#define THRUST_PP_BOOL_IMPL_TAG187 1
+#define THRUST_PP_BOOL_IMPL_TAG188 1
+#define THRUST_PP_BOOL_IMPL_TAG189 1
+#define THRUST_PP_BOOL_IMPL_TAG190 1
+#define THRUST_PP_BOOL_IMPL_TAG191 1
+#define THRUST_PP_BOOL_IMPL_TAG192 1
+#define THRUST_PP_BOOL_IMPL_TAG193 1
+#define THRUST_PP_BOOL_IMPL_TAG194 1
+#define THRUST_PP_BOOL_IMPL_TAG195 1
+#define THRUST_PP_BOOL_IMPL_TAG196 1
+#define THRUST_PP_BOOL_IMPL_TAG197 1
+#define THRUST_PP_BOOL_IMPL_TAG198 1
+#define THRUST_PP_BOOL_IMPL_TAG199 1
+#define THRUST_PP_BOOL_IMPL_TAG200 1
+#define THRUST_PP_BOOL_IMPL_TAG201 1
+#define THRUST_PP_BOOL_IMPL_TAG202 1
+#define THRUST_PP_BOOL_IMPL_TAG203 1
+#define THRUST_PP_BOOL_IMPL_TAG204 1
+#define THRUST_PP_BOOL_IMPL_TAG205 1
+#define THRUST_PP_BOOL_IMPL_TAG206 1
+#define THRUST_PP_BOOL_IMPL_TAG207 1
+#define THRUST_PP_BOOL_IMPL_TAG208 1
+#define THRUST_PP_BOOL_IMPL_TAG209 1
+#define THRUST_PP_BOOL_IMPL_TAG210 1
+#define THRUST_PP_BOOL_IMPL_TAG211 1
+#define THRUST_PP_BOOL_IMPL_TAG212 1
+#define THRUST_PP_BOOL_IMPL_TAG213 1
+#define THRUST_PP_BOOL_IMPL_TAG214 1
+#define THRUST_PP_BOOL_IMPL_TAG215 1
+#define THRUST_PP_BOOL_IMPL_TAG216 1
+#define THRUST_PP_BOOL_IMPL_TAG217 1
+#define THRUST_PP_BOOL_IMPL_TAG218 1
+#define THRUST_PP_BOOL_IMPL_TAG219 1
+#define THRUST_PP_BOOL_IMPL_TAG220 1
+#define THRUST_PP_BOOL_IMPL_TAG221 1
+#define THRUST_PP_BOOL_IMPL_TAG222 1
+#define THRUST_PP_BOOL_IMPL_TAG223 1
+#define THRUST_PP_BOOL_IMPL_TAG224 1
+#define THRUST_PP_BOOL_IMPL_TAG225 1
+#define THRUST_PP_BOOL_IMPL_TAG226 1
+#define THRUST_PP_BOOL_IMPL_TAG227 1
+#define THRUST_PP_BOOL_IMPL_TAG228 1
+#define THRUST_PP_BOOL_IMPL_TAG229 1
+#define THRUST_PP_BOOL_IMPL_TAG230 1
+#define THRUST_PP_BOOL_IMPL_TAG231 1
+#define THRUST_PP_BOOL_IMPL_TAG232 1
+#define THRUST_PP_BOOL_IMPL_TAG233 1
+#define THRUST_PP_BOOL_IMPL_TAG234 1
+#define THRUST_PP_BOOL_IMPL_TAG235 1
+#define THRUST_PP_BOOL_IMPL_TAG236 1
+#define THRUST_PP_BOOL_IMPL_TAG237 1
+#define THRUST_PP_BOOL_IMPL_TAG238 1
+#define THRUST_PP_BOOL_IMPL_TAG239 1
+#define THRUST_PP_BOOL_IMPL_TAG240 1
+#define THRUST_PP_BOOL_IMPL_TAG241 1
+#define THRUST_PP_BOOL_IMPL_TAG242 1
+#define THRUST_PP_BOOL_IMPL_TAG243 1
+#define THRUST_PP_BOOL_IMPL_TAG244 1
+#define THRUST_PP_BOOL_IMPL_TAG245 1
+#define THRUST_PP_BOOL_IMPL_TAG246 1
+#define THRUST_PP_BOOL_IMPL_TAG247 1
+#define THRUST_PP_BOOL_IMPL_TAG248 1
+#define THRUST_PP_BOOL_IMPL_TAG249 1
+#define THRUST_PP_BOOL_IMPL_TAG250 1
+#define THRUST_PP_BOOL_IMPL_TAG251 1
+#define THRUST_PP_BOOL_IMPL_TAG252 1
+#define THRUST_PP_BOOL_IMPL_TAG253 1
+#define THRUST_PP_BOOL_IMPL_TAG254 1
+#define THRUST_PP_BOOL_IMPL_TAG255 1
+#define THRUST_PP_BOOL_IMPL_TAG256 1
+
+///////////////////////////////////////////////////////////////////////////////
+
+#define THRUST_PP_IIF(bit, t, f) THRUST_PP_IIF_IMPL0(bit, t, f)
+
+#if defined(_MSC_VER)
+  #define THRUST_PP_IIF_IMPL0(bit, t, f)                                      \
+    THRUST_PP_IIF_IMPL1(THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f)))    \
+    /**/
+  #define THRUST_PP_IIF_IMPL1(id) id
+#else
+  #define THRUST_PP_IIF_IMPL0(bit, t, f)                                      \
+    THRUST_PP_CAT2(THRUST_PP_IIF_IMPL_TAG, bit(t, f))
+    /**/
+#endif
+
+#define THRUST_PP_IIF_IMPL_TAG0(t, f) f
+#define THRUST_PP_IIF_IMPL_TAG1(t, f) t
+
+#if defined(__EDG__)
+  #define THRUST_PP_IF(cond, t, f) THRUST_PP_IF_IMPL0(cond, t, f)
+  #define THRUST_PP_IF_IMPL0(cond, t, f)                                      \
+    THRUST_PP_IIF(THRUST_PP_BOOL(cond), t, f)                                 \
+    /**/
+#else
+  #define THRUST_PP_IF(cond, t, f) THRUST_PP_IIF(THRUST_PP_BOOL(cond), t, f)
+#endif
+
+/// \def THRUST_COMMA_IF(cond)
+/// \brief If \a cond is true, expands to a comma. Otherwise, expands to nothing.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_STRINGIZE(THRUST_COMMA_IF(0)) << "\n"
+///             << THRUST_PP_STRINGIZE(THRUST_COMMA_IF(1)) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << ""  << "\n"
+///             << "," << "\n";
+/// }
+/// \endcode
+///
+#if defined(__EDG__)
+  #define THRUST_PP_COMMA_IF(cond) THRUST_PP_COMMA_IF_IMPL0(cond)
+  #define THRUST_PP_COMMA_IF_IMPL0(cond)                                      \
+    THRUST_PP_IF(cond, THRUST_PP_COMMA, THRUST_PP_EMPTY)()                    \
+    /**/
+#else
+  #define THRUST_PP_COMMA_IF(cond)                                            \
+    THRUST_PP_IF(cond, THRUST_PP_COMMA, THRUST_PP_EMPTY)()                    \
+    /**/
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+// http://gustedt.wordpress.com/2010/06/08/detect-empty-macro-arguments
+
+#define THRUST_PP_64TH_ARG(                                                   \
+     _1, _2, _3, _4, _5, _6, _7, _8, _9,_10,_11,_12,_13,_14,_15,_16           \
+  , _17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32           \
+  , _33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48           \
+  , _49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63,  N           \
+  , ...                                                                       \
+  ) N                                                                         \
+  /**/
+
+#define THRUST_PP_HAS_COMMA(...)                                              \
+  THRUST_PP_EXPAND(THRUST_PP_64TH_ARG(                                        \
+    __VA_ARGS__                                                               \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1                                           \
+  , 1,1,1,1,1,1,1,1,1,1,1,1,1,1,0                                             \
+  ))                                                                          \
+  /**/
+
+#define THRUST_PP_TRIGGER_PAREN(...) ,
+
+#define THRUST_PP_IS_VARIADIC_NULLARY(...)                                    \
+  THRUST_PP_IS_VARIADIC_NULLARY_IMPL0(                                        \
+    /* Test if there is just one argument, eventually an empty one. */        \
+    THRUST_PP_HAS_COMMA(__VA_ARGS__),                                         \
+    /* Test if THRUST_PP_TRIGGER_PAREN together with the argument adds a */   \
+    /* comma. */                                                              \
+    THRUST_PP_HAS_COMMA(THRUST_PP_TRIGGER_PAREN __VA_ARGS__),                 \
+    /* Test if the argument together with a parenthesis adds a comma. */      \
+    THRUST_PP_HAS_COMMA(__VA_ARGS__ (/*empty*/)),                             \
+    /* Test if placing it between THRUST_PP_TRIGGER_PAREN and the */          \
+    /* parenthesis adds a comma. */                                           \
+    THRUST_PP_HAS_COMMA(THRUST_PP_TRIGGER_PAREN __VA_ARGS__ (/*empty*/))      \
+  )                                                                           \
+  /**/
+
+#define THRUST_PP_IS_VARIADIC_NULLARY_IMPL0(_0, _1, _2, _3)                   \
+  THRUST_PP_HAS_COMMA(                                                        \
+    THRUST_PP_CAT5(THRUST_PP_IS_VARIADIC_NULLARY_IMPL_TAG, _0, _1, _2, _3)    \
+  )                                                                           \
+
+#define THRUST_PP_IS_VARIADIC_NULLARY_IMPL_TAG0001 ,
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_PP_ARITY(...)
+/// \brief Returns the number of arguments that it was called with. Must be
+///        called with less than 64 arguments.
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << THRUST_PP_ARITY()        << "\n"
+///             << THRUST_PP_ARITY(x)       << "\n"
+///             << THRUST_PP_ARITY(x, y)    << "\n"
+///             << THRUST_PP_ARITY(x, y, z) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 0 << "\n"
+///             << 1 << "\n"
+///             << 2 << "\n"
+///             << 3 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_ARITY(...)                                                  \
+  THRUST_PP_EXPAND(                                                           \
+    THRUST_PP_IF(                                                             \
+      THRUST_PP_IS_VARIADIC_NULLARY(__VA_ARGS__)                              \
+    , 0                                                                       \
+    , THRUST_PP_64TH_ARG(                                                     \
+        __VA_ARGS__                                                           \
+      , 63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48                       \
+      , 47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32                       \
+      , 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16                       \
+      , 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0                       \
+      )                                                                       \
+    )                                                                         \
+  )                                                                           \
+  /**/
+
+/// \def THRUST_PP_DISPATCH(basename, ...)
+/// \brief Expands to <tt>basenameN(...)</tt>, where <tt>N</tt> is the
+///        number of variadic arguments that \a THRUST_PP_DISPATCH was called
+///        with. This macro can be used to implement "macro overloading".
+///
+/// \par <b>Example</b>:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// #define PLUS(...) THRUST_PP_DISPATCH(PLUS, __VA_ARGS__)
+/// #define PLUS0()        0
+/// #define PLUS1(x)       x
+/// #define PLUS2(x, y)    x + y
+/// #define PLUS3(x, y, z) x + y + z
+///
+/// int main()
+/// {
+///   std::cout << PLUS()        << "\n"
+///             << PLUS(1)       << "\n"
+///             << PLUS(1, 2)    << "\n"
+///             << PLUS(1, 2, 3) << "\n";
+/// }
+/// \endcode
+///
+/// The above code expands to:
+///
+/// \code
+/// #include <thrust/detail/preprocessor.h>
+/// #include <iostream>
+///
+/// int main()
+/// {
+///   std::cout << 0         << "\n"
+///             << 1         << "\n"
+///             << 1 + 2     << "\n"
+///             << 1 + 2 + 3 << "\n";
+/// }
+/// \endcode
+///
+#define THRUST_PP_DISPATCH(basename, ...)                                     \
+  THRUST_PP_EXPAND(                                                           \
+    THRUST_PP_CAT2(                                                           \
+      basename,                                                               \
+      THRUST_PP_ARITY(__VA_ARGS__)                                            \
+    )(__VA_ARGS__)                                                            \
+  )                                                                           \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_CURRENT_FUNCTION
+/// \brief The name of the current function as a string.
+///
+#if    defined(__GNUC__)                                                      \
+    || (defined(__MWERKS__) && (__MWERKS__ >= 0x3000))                        \
+    || (defined(__ICC) && (__ICC >= 600)) || defined(__ghs__)
+  #define THRUST_CURRENT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(__DMC__) && (__DMC__ >= 0x810)
+  #define THRUST_CURRENT_FUNCTION __PRETTY_FUNCTION__
+#elif defined(__FUNCSIG__)
+  #define THRUST_CURRENT_FUNCTION __FUNCSIG__
+#elif    (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 600))             \
+      || (defined(__IBMCTHRUST_PP__) && (__IBMCTHRUST_PP__ >= 500))
+  #define THRUST_CURRENT_FUNCTION __FUNCTION__
+#elif defined(__BORLANDC__) && (__BORLANDC__ >= 0x550)
+  #define THRUST_CURRENT_FUNCTION __FUNC__
+#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)
+  #define THRUST_CURRENT_FUNCTION __func__
+#elif defined(__cplusplus) && (__cplusplus >= 201103)
+  #define THRUST_CURRENT_FUNCTION __func__
+#else
+  #define THRUST_CURRENT_FUNCTION "(unknown)"
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
diff --git a/thrust/detail/range/head_flags.h b/thrust/detail/range/head_flags.h
index b193651cf..b755840c9 100644
--- a/thrust/detail/range/head_flags.h
+++ b/thrust/detail/range/head_flags.h
@@ -24,8 +24,7 @@
 #include <thrust/functional.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -226,5 +225,5 @@ head_flags<RandomAccessIterator>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/range/tail_flags.h b/thrust/detail/range/tail_flags.h
index 32ccb53c6..41ee5dd29 100644
--- a/thrust/detail/range/tail_flags.h
+++ b/thrust/detail/range/tail_flags.h
@@ -23,8 +23,7 @@
 #include <thrust/tuple.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -130,5 +129,5 @@ tail_flags<RandomAccessIterator>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/raw_pointer_cast.h b/thrust/detail/raw_pointer_cast.h
index 5d5f59d81..53a77861e 100644
--- a/thrust/detail/raw_pointer_cast.h
+++ b/thrust/detail/raw_pointer_cast.h
@@ -19,15 +19,32 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename Pointer>
-  inline __host__ __device__ typename thrust::detail::pointer_traits<Pointer>::raw_pointer
-    raw_pointer_cast(const Pointer &ptr)
+__host__ __device__
+typename thrust::detail::pointer_traits<Pointer>::raw_pointer
+raw_pointer_cast(Pointer ptr)
 {
   return thrust::detail::pointer_traits<Pointer>::get(ptr);
-} // end raw_pointer_cast()
+}
+
+template <typename ToPointer, typename FromPointer>
+__host__ __device__
+ToPointer
+reinterpret_pointer_cast(FromPointer ptr)
+{
+  typedef typename thrust::detail::pointer_element<ToPointer>::type to_element;
+  return ToPointer(reinterpret_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
+}
 
-} // end thrust
+template <typename ToPointer, typename FromPointer>
+__host__ __device__
+ToPointer
+static_pointer_cast(FromPointer ptr)
+{
+  typedef typename thrust::detail::pointer_element<ToPointer>::type to_element;
+  return ToPointer(static_cast<to_element*>(thrust::raw_pointer_cast(ptr)));
+}
 
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/raw_reference_cast.h b/thrust/detail/raw_reference_cast.h
index 4644a16f2..eff45f0c2 100644
--- a/thrust/detail/raw_reference_cast.h
+++ b/thrust/detail/raw_reference_cast.h
@@ -29,8 +29,7 @@
 // raw_reference_cast depends on metafunctions such as is_unwrappable and raw_reference
 // we need to be sure that these metafunctions are completely defined (including specializations) before they are instantiated by raw_reference_cast
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -48,26 +47,12 @@ template<typename T>
 
 // specialize is_unwrappable
 // a tuple is_unwrappable if any of its elements is_unwrappable
-template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
->
+template<typename... Ts>
   struct is_unwrappable<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
     : or_<
-        is_unwrappable<T0>,
-        is_unwrappable<T1>,
-        is_unwrappable<T2>,
-        is_unwrappable<T3>,
-        is_unwrappable<T4>,
-        is_unwrappable<T5>,
-        is_unwrappable<T6>,
-        is_unwrappable<T7>,
-        is_unwrappable<T8>,
-        is_unwrappable<T9>
+        is_unwrappable<Ts>...
       >
 {};
 
@@ -75,25 +60,13 @@ template<
 // specialize is_unwrappable
 // a tuple_of_iterator_references is_unwrappable if any of its elements is_unwrappable
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct is_unwrappable<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
     : or_<
-        is_unwrappable<T0>,
-        is_unwrappable<T1>,
-        is_unwrappable<T2>,
-        is_unwrappable<T3>,
-        is_unwrappable<T4>,
-        is_unwrappable<T5>,
-        is_unwrappable<T6>,
-        is_unwrappable<T7>,
-        is_unwrappable<T8>,
-        is_unwrappable<T9>
+        is_unwrappable<Ts>...
       >
 {};
 
@@ -137,7 +110,7 @@ template<typename T>
 
 
 template<typename T>
-  struct raw_reference : 
+  struct raw_reference :
     raw_reference_detail::raw_reference_impl<T>
 {};
 
@@ -173,51 +146,27 @@ template<typename T>
 
 // recurse on tuples
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference_tuple_helper<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
 {
   typedef thrust::tuple<
-    typename raw_reference_tuple_helper<T0>::type,
-    typename raw_reference_tuple_helper<T1>::type,
-    typename raw_reference_tuple_helper<T2>::type,
-    typename raw_reference_tuple_helper<T3>::type,
-    typename raw_reference_tuple_helper<T4>::type,
-    typename raw_reference_tuple_helper<T5>::type,
-    typename raw_reference_tuple_helper<T6>::type,
-    typename raw_reference_tuple_helper<T7>::type,
-    typename raw_reference_tuple_helper<T8>::type,
-    typename raw_reference_tuple_helper<T9>::type
+    typename raw_reference_tuple_helper<Ts>::type...
   > type;
 };
 
 
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference_tuple_helper<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
 {
   typedef thrust::detail::tuple_of_iterator_references<
-    typename raw_reference_tuple_helper<T0>::type,
-    typename raw_reference_tuple_helper<T1>::type,
-    typename raw_reference_tuple_helper<T2>::type,
-    typename raw_reference_tuple_helper<T3>::type,
-    typename raw_reference_tuple_helper<T4>::type,
-    typename raw_reference_tuple_helper<T5>::type,
-    typename raw_reference_tuple_helper<T6>::type,
-    typename raw_reference_tuple_helper<T7>::type,
-    typename raw_reference_tuple_helper<T8>::type,
-    typename raw_reference_tuple_helper<T9>::type
+    typename raw_reference_tuple_helper<Ts>::type...
   > type;
 };
 
@@ -232,17 +181,14 @@ template <
 //   then the raw_reference of tuple_type is a tuple of its members' raw_references
 //   else the raw_reference of tuple_type is tuple_type &
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference<
-    thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::tuple<Ts...>
   >
 {
   private:
-    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+    typedef thrust::tuple<Ts...> tuple_type;
 
   public:
     typedef typename eval_if<
@@ -254,17 +200,14 @@ template <
 
 
 template <
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   struct raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >
 {
   private:
-    typedef detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> tuple_type;
+    typedef detail::tuple_of_iterator_references<Ts...> tuple_type;
 
   public:
     typedef typename raw_reference_detail::raw_reference_tuple_helper<tuple_type>::type type;
@@ -283,31 +226,28 @@ template <
 
 // provide declarations of raw_reference_cast's overloads for raw_reference_caster below
 template<typename T>
-inline __host__ __device__
+__host__ __device__
 typename detail::raw_reference<T>::type
   raw_reference_cast(T &ref);
 
 
 template<typename T>
-inline __host__ __device__
+__host__ __device__
 typename detail::raw_reference<const T>::type
   raw_reference_cast(const T &ref);
 
 
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
 __host__ __device__
 typename detail::enable_if_unwrappable<
-  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  thrust::detail::tuple_of_iterator_references<Ts...>,
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
 >::type
-raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t);
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<Ts...> t);
 
 
 namespace detail
@@ -331,18 +271,15 @@ struct raw_reference_caster
   }
 
   template<
-    typename T0, typename T1, typename T2,
-    typename T3, typename T4, typename T5,
-    typename T6, typename T7, typename T8,
-    typename T9
+    typename... Ts
   >
   __host__ __device__
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
-  operator()(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t,
+  operator()(thrust::detail::tuple_of_iterator_references<Ts...> t,
              typename enable_if<
-               is_unwrappable<thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> >::value
+               is_unwrappable<thrust::detail::tuple_of_iterator_references<Ts...> >::value
              >::type * = 0)
   {
     return thrust::raw_reference_cast(t);
@@ -354,7 +291,7 @@ struct raw_reference_caster
 
 
 template<typename T>
-inline __host__ __device__
+__host__ __device__
 typename detail::raw_reference<T>::type
   raw_reference_cast(T &ref)
 {
@@ -363,7 +300,7 @@ typename detail::raw_reference<T>::type
 
 
 template<typename T>
-inline __host__ __device__
+__host__ __device__
 typename detail::raw_reference<const T>::type
   raw_reference_cast(const T &ref)
 {
@@ -372,19 +309,16 @@ typename detail::raw_reference<const T>::type
 
 
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
 __host__ __device__
 typename detail::enable_if_unwrappable<
-  thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>,
+  thrust::detail::tuple_of_iterator_references<Ts...>,
   typename detail::raw_reference<
-    thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    thrust::detail::tuple_of_iterator_references<Ts...>
   >::type
 >::type
-raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> t)
+raw_reference_cast(thrust::detail::tuple_of_iterator_references<Ts...> t)
 {
   thrust::detail::raw_reference_caster f;
 
@@ -394,5 +328,5 @@ raw_reference_cast(thrust::detail::tuple_of_iterator_references<T0,T1,T2,T3,T4,T
 } // end raw_reference_cast
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reduce.inl b/thrust/detail/reduce.inl
index 2ecedc7a2..448a4b38c 100644
--- a/thrust/detail/reduce.inl
+++ b/thrust/detail/reduce.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file reduce.inl
- *  \brief Inline file for reduce.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/reduce.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -27,8 +26,7 @@
 #include <thrust/system/detail/adl/reduce.h>
 #include <thrust/system/detail/adl/reduce_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -81,7 +79,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -102,7 +100,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -125,7 +123,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   reduce_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
+                InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -192,7 +190,7 @@ template<typename InputIterator1,
          typename OutputIterator1,
          typename OutputIterator2>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -220,7 +218,7 @@ template<typename InputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -250,7 +248,7 @@ template<typename InputIterator1,
          typename BinaryPredicate,
          typename BinaryFunction>
   thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(InputIterator1 keys_first, 
+  reduce_by_key(InputIterator1 keys_first,
                 InputIterator1 keys_last,
                 InputIterator2 values_first,
                 OutputIterator1 keys_output,
@@ -274,5 +272,5 @@ template<typename InputIterator1,
 }
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reference.h b/thrust/detail/reference.h
index caf1383cb..5cc13625d 100644
--- a/thrust/detail/reference.h
+++ b/thrust/detail/reference.h
@@ -14,146 +14,505 @@
  *  limitations under the License.
  */
 
+/*! \file 
+ *  \brief A pointer to a variable which resides in memory associated with a
+ *  system.
+ */
+
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/use_default.h>
 #include <thrust/detail/reference_forward_declaration.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/memory.h>
+#include <thrust/system/detail/adl/get_value.h>
+#include <thrust/system/detail/adl/assign_value.h>
+#include <thrust/system/detail/adl/iter_swap.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <type_traits>
 #include <ostream>
 
+THRUST_NAMESPACE_BEGIN
 
-namespace thrust
-{
 namespace detail
 {
-
-template<typename> struct is_wrapped_reference;
-
+template <typename>
+struct is_wrapped_reference;
 }
 
-// the base type for all of thrust's system-annotated references.
-// for reasonable reference-like semantics, derived types must reimplement the following:
-// 1. constructor from pointer
-// 2. copy constructor
-// 3. templated copy constructor from other reference
-// 4. templated assignment from other reference
-// 5. assignment from value_type
-template<typename Element, typename Pointer, typename Derived>
-  class reference
+/*! \p reference acts as a reference-like wrapper for an object residing in
+ *  memory that a \p pointer refers to.
+ */
+template <typename Element, typename Pointer, typename Derived>
+class reference
 {
-  private:
-    typedef typename thrust::detail::eval_if<
-      thrust::detail::is_same<Derived,use_default>::value,
-      thrust::detail::identity_<reference>,
-      thrust::detail::identity_<Derived>
-    >::type derived_type;
-
-    // hint for is_wrapped_reference lets it know that this type (or a derived type)
-    // is a wrapped reference
-    struct wrapped_reference_hint {};
-    template<typename> friend struct thrust::detail::is_wrapped_reference;
-
-  public:
-    typedef Pointer                                              pointer;
-    typedef typename thrust::detail::remove_const<Element>::type value_type;
-
-    __host__ __device__
-    explicit reference(const pointer &ptr);
-
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                pointer
-              >::type * = 0);
-
-    __host__ __device__
-    derived_type &operator=(const reference &other);
-
-    // XXX this may need an enable_if
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
-
-    __host__ __device__
-    derived_type &operator=(const value_type &x);
-
-    __host__ __device__
-    pointer operator&() const;
-
-    __host__ __device__
-    operator value_type () const;
-
-    __host__ __device__
-    void swap(derived_type &other);
-
-    derived_type &operator++();
-
-    value_type operator++(int);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator+=(const value_type &rhs);
-
-    derived_type &operator--();
-
-    value_type operator--(int);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator-=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator*=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator/=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator%=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator<<=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator>>=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator&=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator|=(const value_type &rhs);
-
-    // XXX parameterize the type of rhs
-    derived_type &operator^=(const value_type &rhs);
-
-  private:
-    const pointer m_ptr;
-
-    // allow access to m_ptr for other references
-    template <typename OtherElement, typename OtherPointer, typename OtherDerived> friend class reference;
-
-    template<typename System>
-    __host__ __device__
-    inline value_type strip_const_get_value(const System &system) const;
-
-    template<typename OtherPointer>
-    __host__ __device__
-    inline void assign_from(OtherPointer src);
-
-    template<typename System, typename OtherPointer>
-    __host__ __device__
-    inline void strip_const_assign_value(const System &system, OtherPointer src);
-}; // end reference
+private:
+  using derived_type = typename std::conditional<
+    std::is_same<Derived, use_default>::value, reference, Derived
+  >::type;
+
+public:
+  using pointer    = Pointer;
+  using value_type = typename thrust::remove_cvref<Element>::type;
+
+  reference(reference const&) = default;
+
+  reference(reference&&) = default;
+
+  /*! Construct a \p reference from another \p reference whose pointer type is
+   *  convertible to \p pointer. After this \p reference is constructed, it
+   *  shall refer to the same object as \p other.
+   *
+   *  \tparam OtherElement The element type of the other \p reference.
+   *  \tparam OtherPointer The pointer type of the other \p reference.
+   *  \tparam OtherDerived The derived type of the other \p reference.
+   *  \param  other        A \p reference to copy from.
+   */
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  __host__ __device__
+  reference(
+    reference<OtherElement, OtherPointer, OtherDerived> const& other
+  /*! \cond
+   */
+  , typename std::enable_if<
+      std::is_convertible<
+        typename reference<OtherElement, OtherPointer, OtherDerived>::pointer
+      , pointer
+      >::value
+    >::type* = nullptr
+  /*! \endcond
+   */
+  )
+    : ptr(other.ptr)
+  {}
+
+  /*! Construct a \p reference that refers to an object pointed to by the given
+   *  \p pointer. After this \p reference is constructed, it shall refer to the
+   *  object pointed to by \p ptr.
+   *
+   *  \param ptr A \p pointer to construct from.
+   */
+  __host__ __device__
+  explicit reference(pointer const& p) : ptr(p) {}
+
+  /*! Assign the object referred to \p other to the object referred to by
+   *  this \p reference.
+   *
+   *  \param other The other \p reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  derived_type& operator=(reference const& other)
+  {
+    assign_from(&other);
+    return derived();
+  }
+
+  /*! Assign the object referred to by this \p reference with the object
+   *  referred to by another \p reference whose pointer type is convertible to
+   *  \p pointer.
+   *
+   *  \tparam OtherElement The element type of the other \p reference.
+   *  \tparam OtherPointer The pointer type of the other \p reference.
+   *  \tparam OtherDerived The derived type of the other \p reference.
+   *  \param  other        The other \p reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  __host__ __device__
+  /*! \cond
+   */
+  typename std::enable_if<
+    std::is_convertible<
+      typename reference<OtherElement, OtherPointer, OtherDerived>::pointer
+    , pointer
+    >::value,
+  /*! \endcond
+   */
+    derived_type&
+  /*! \cond
+   */
+  >::type
+  /*! \endcond
+   */
+  operator=(reference<OtherElement, OtherPointer, OtherDerived> const& other)
+  {
+    assign_from(&other);
+    return derived();
+  }
+
+  /*! Assign \p rhs to the object referred to by this \p tagged_reference.
+   *
+   *  \param rhs The \p value_type to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  derived_type& operator=(value_type const& rhs)
+  {
+    assign_from(&rhs);
+    return derived();
+  }
+
+  /*! Exchanges the value of the object referred to by this \p tagged_reference
+   *  with the object referred to by \p other.
+   *
+   *  \param other The \p tagged_reference to swap with.
+   */
+  __host__ __device__
+  void swap(derived_type& other)
+  {
+    // Avoid default-constructing a system; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type* system = nullptr;
+    swap(system, other);
+  }
+
+  __host__ __device__ pointer operator&() const { return ptr; }
+
+  // This is inherently hazardous, as it discards the strong type information
+  // about what system the object is on.
+  __host__ __device__ operator value_type() const
+  {
+    // Avoid default-constructing a system; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type* system = nullptr;
+    return convert_to_value_type(system);
+  }
+
+  __host__ __device__
+  derived_type& operator++()
+  {
+    // Sadly, this has to make a copy. The only mechanism we have for
+    // modifying the value, which may be in memory inaccessible to this
+    // system, is to get a copy of it, modify the copy, and then update it.
+    value_type tmp = *this;
+    ++tmp;
+    *this = tmp;
+    return derived();
+  }
+
+  __host__ __device__
+  value_type operator++(int)
+  {
+    value_type tmp = *this;
+    value_type result = tmp++;
+    *this = std::move(tmp);
+    return result;
+  }
+
+  derived_type& operator--()
+  {
+    // Sadly, this has to make a copy. The only mechanism we have for
+    // modifying the value, which may be in memory inaccessible to this
+    // system, is to get a copy of it, modify the copy, and then update it.
+    value_type tmp = *this;
+    --tmp;
+    *this = std::move(tmp);
+    return derived();
+  }
+
+  value_type operator--(int)
+  {
+    value_type tmp = *this;
+    value_type result = tmp--;
+    *this = std::move(tmp);
+    return derived();
+  }
+
+  __host__ __device__
+  derived_type& operator+=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp += rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator-=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp -= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator*=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp *= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator/=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp /= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator%=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp %= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator<<=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp <<= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator>>=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp >>= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator&=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp &= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator|=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp |= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+  derived_type& operator^=(value_type const& rhs)
+  {
+    value_type tmp = *this;
+    tmp ^= rhs;
+    *this = tmp;
+    return derived();
+  }
+
+private:
+  pointer const ptr;
+
+  // `thrust::detail::is_wrapped_reference` is a trait that indicates whether
+  // a type is a fancy reference. It detects such types by loooking for a
+  // nested `wrapped_reference_hint` type.
+  struct wrapped_reference_hint {};
+  template <typename>
+  friend struct thrust::detail::is_wrapped_reference;
+
+  template <typename OtherElement, typename OtherPointer, typename OtherDerived>
+  friend class reference;
+
+  __host__ __device__
+  derived_type& derived() { return static_cast<derived_type&>(*this); }
+
+  template<typename System>
+  __host__ __device__
+  value_type convert_to_value_type(System* system) const
+  {
+    using thrust::system::detail::generic::select_system;
+    return strip_const_get_value(select_system(*system));
+  }
+
+  template <typename System>
+  __host__ __device__
+  value_type strip_const_get_value(System const& system) const
+  {
+    System &non_const_system = const_cast<System&>(system);
+
+    using thrust::system::detail::generic::get_value;
+    return get_value(thrust::detail::derived_cast(non_const_system), ptr);
+  }
+
+  template <typename System0, typename System1, typename OtherPointer>
+  __host__ __device__
+  void assign_from(System0* system0, System1* system1, OtherPointer src)
+  {
+    using thrust::system::detail::generic::select_system;
+    strip_const_assign_value(select_system(*system0, *system1), src);
+  }
+
+  template <typename OtherPointer>
+  __host__ __device__
+  void assign_from(OtherPointer src)
+  {
+    // Avoid default-constructing systems; instead, just use a null pointer
+    // for dispatch. This assumes that `get_value` will not access any system
+    // state.
+    typename thrust::iterator_system<pointer>::type*      system0 = nullptr;
+    typename thrust::iterator_system<OtherPointer>::type* system1 = nullptr;
+    assign_from(system0, system1, src);
+  }
+
+  template <typename System, typename OtherPointer>
+  __host__ __device__
+  void strip_const_assign_value(System const& system, OtherPointer src)
+  {
+    System& non_const_system = const_cast<System&>(system);
+
+    using thrust::system::detail::generic::assign_value;
+    assign_value(thrust::detail::derived_cast(non_const_system), ptr, src);
+  }
+
+  template <typename System>
+  __host__ __device__
+  void swap(System* system, derived_type& other)
+  {
+    using thrust::system::detail::generic::select_system;
+    using thrust::system::detail::generic::iter_swap;
+
+    iter_swap(select_system(*system, *system), ptr, other.ptr);
+  }
+};
+
+template <typename Pointer, typename Derived>
+class reference<void, Pointer, Derived> {};
+
+template <typename Pointer, typename Derived>
+class reference<void const, Pointer, Derived> {};
+
+template <
+  typename Element, typename Pointer, typename Derived
+, typename CharT, typename Traits
+>
+std::basic_ostream<CharT, Traits>& operator<<(
+  std::basic_ostream<CharT, Traits>&os
+, reference<Element, Pointer, Derived> const& r
+) {
+  using value_type = typename reference<Element, Pointer, Derived>::value_type;
+  return os << static_cast<value_type>(r);
+}
 
-// Output stream operator
-template<typename Element, typename Pointer, typename Derived,
-         typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os,
-           const reference<Element, Pointer, Derived> &y);
+template <typename Element, typename Tag>
+class tagged_reference;
 
-} // end thrust
+/*! \p tagged_reference acts as a reference-like wrapper for an object residing
+ *  in memory associated with system \p Tag that a \p pointer refers to.
+ */
+template <typename Element, typename Tag>
+class tagged_reference
+  : public thrust::reference<
+      Element
+    , thrust::pointer<Element, Tag, tagged_reference<Element, Tag>>
+    , tagged_reference<Element, Tag>
+    >
+{
+private:
+  using base_type = thrust::reference<
+    Element
+  , thrust::pointer<Element, Tag, tagged_reference<Element, Tag>>
+  , tagged_reference<Element, Tag>
+  >;
+
+public:
+  using value_type = typename base_type::value_type;
+  using pointer    = typename base_type::pointer;
+
+  tagged_reference(tagged_reference const&) = default;
+
+  tagged_reference(tagged_reference&&) = default;
+
+  /*! Construct a \p tagged_reference from another \p tagged_reference whose
+   *  pointer type is convertible to \p pointer. After this \p tagged_reference
+   *  is constructed, it shall refer to the same object as \p other.
+   *
+   *  \tparam OtherElement The element type of the other \p tagged_reference.
+   *  \tparam OtherTag     The tag type of the other \p tagged_reference.
+   *  \param  other        A \p tagged_reference to copy from.
+   */
+  template <typename OtherElement, typename OtherTag>
+  __host__ __device__
+  tagged_reference(tagged_reference<OtherElement, OtherTag> const& other)
+    : base_type(other)
+  {}
+
+  /*! Construct a \p tagged_reference that refers to an object pointed to by
+   *  the given \p pointer. After this \p tagged_reference is constructed, it
+   *  shall refer to the object pointed to by \p ptr.
+   *
+   *  \param ptr A \p pointer to construct from.
+   */
+  __host__ __device__ explicit tagged_reference(pointer const& p)
+    : base_type(p)
+  {}
+
+  /*! Assign the object referred to \p other to the object referred to by
+   *  this \p tagged_reference.
+   *
+   *  \param other The other \p tagged_reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  tagged_reference& operator=(tagged_reference const& other)
+  {
+    return base_type::operator=(other);
+  }
+
+  /*! Assign the object referred to by this \p tagged_reference with the object
+   *  referred to by another \p tagged_reference whose pointer type is
+   *  convertible to \p pointer.
+   *
+   *  \tparam OtherElement The element type of the other \p tagged_reference.
+   *  \tparam OtherTag     The tag type of the other \p tagged_reference.
+   *  \param  other        The other \p tagged_reference to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  template <typename OtherElement, typename OtherTag>
+  __host__ __device__
+  tagged_reference&
+  operator=(tagged_reference<OtherElement, OtherTag> const& other)
+  {
+    return base_type::operator=(other);
+  }
+
+  /*! Assign \p rhs to the object referred to by this \p tagged_reference.
+   *
+   *  \param rhs The \p value_type to assign from.
+   *
+   *  \return <tt>*this</tt>.
+   */
+  __host__ __device__
+  tagged_reference& operator=(value_type const& rhs)
+  {
+    return base_type::operator=(rhs);
+  }
+};
+
+template <typename Tag>
+class tagged_reference<void, Tag> {};
+
+template <typename Tag>
+class tagged_reference<void const, Tag> {};
+
+/*! Exchanges the values of two objects referred to by \p tagged_reference.
+ *
+ *  \param x The first \p tagged_reference of interest.
+ *  \param y The second \p tagged_reference of interest.
+ */
+template <typename Element, typename Tag>
+__host__ __device__
+void swap(tagged_reference<Element, Tag>& x, tagged_reference<Element, Tag>& y)
+{
+  x.swap(y);
+}
 
-#include <thrust/detail/reference.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reference.inl b/thrust/detail/reference.inl
deleted file mode 100644
index 2d334defe..000000000
--- a/thrust/detail/reference.inl
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/reference.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/system/detail/generic/select_system.h>
-#include <thrust/system/detail/generic/memory.h>
-#include <thrust/system/detail/adl/get_value.h>
-#include <thrust/system/detail/adl/assign_value.h>
-#include <thrust/system/detail/adl/iter_swap.h>
-
-
-namespace thrust
-{
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    reference<Element,Pointer,Derived>
-      ::reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-                  typename thrust::detail::enable_if_convertible<
-                    typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                    pointer
-                  >::type *)
-        : m_ptr(other.m_ptr)
-{}
-
-
-template<typename Element, typename Pointer, typename Derived>
-  reference<Element,Pointer,Derived>
-    ::reference(const pointer &ptr)
-      : m_ptr(ptr)
-{}
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::pointer
-    reference<Element,Pointer,Derived>
-      ::operator&() const
-{
-  return m_ptr;
-} // end reference::operator&()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator=(const value_type &v)
-{
-  assign_from(&v);
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator=(const reference &other)
-{
-  assign_from(&other); 
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    typename reference<Element,Pointer,Derived>::derived_type &
-      reference<Element,Pointer,Derived>
-        ::operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other)
-{
-  assign_from(&other);
-  return static_cast<derived_type&>(*this);
-} // end reference::operator=()
-
-
-__thrust_exec_check_disable__
-template<typename Element, typename Pointer, typename Derived>
-  reference<Element,Pointer,Derived>
-    ::operator typename reference<Element,Pointer,Derived>::value_type () const
-{
-  typedef typename thrust::iterator_system<pointer>::type System;
-
-  System system;
-  using thrust::system::detail::generic::select_system;
-  return strip_const_get_value(select_system(system));
-} // end reference::operator value_type ()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System>
-    typename reference<Element,Pointer,Derived>::value_type
-      reference<Element,Pointer,Derived>
-        ::strip_const_get_value(const System &system) const
-{
-  System &non_const_system = const_cast<System&>(system);
-
-  using thrust::system::detail::generic::get_value;
-
-  return get_value(thrust::detail::derived_cast(non_const_system), m_ptr);
-} // end reference::strip_const_get_value()
-
-
-__thrust_exec_check_disable__
-template<typename Element, typename Pointer, typename Derived>
-  template<typename OtherPointer>
-    void reference<Element,Pointer,Derived>
-      ::assign_from(OtherPointer src)
-{
-  typedef typename thrust::iterator_system<pointer>::type      System1;
-  typedef typename thrust::iterator_system<OtherPointer>::type System2;
-
-  System1 system1;
-  System2 system2;
-
-  using thrust::system::detail::generic::select_system;
-
-  strip_const_assign_value(select_system(system1, system2), src);
-} // end assign_from()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  template<typename System, typename OtherPointer>
-    void reference<Element,Pointer,Derived>
-      ::strip_const_assign_value(const System &system, OtherPointer src)
-{
-  System &non_const_system = const_cast<System&>(system);
-
-  using thrust::system::detail::generic::assign_value;
-
-  assign_value(thrust::detail::derived_cast(non_const_system), m_ptr, src);
-} // end strip_const_assign_value()
-
-
-__thrust_exec_check_disable__
-template<typename Element, typename Pointer, typename Derived>
-  void reference<Element,Pointer,Derived>
-    ::swap(derived_type &other)
-{
-  typedef typename thrust::iterator_system<pointer>::type System;
-
-  System system;
-
-  using thrust::system::detail::generic::select_system;
-  using thrust::system::detail::generic::iter_swap;
-
-  iter_swap(select_system(system, system), m_ptr, other.m_ptr);
-} // end reference::swap()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator++(void)
-{
-  value_type temp = *this;
-  ++temp;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator++()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::value_type
-    reference<Element,Pointer,Derived>
-      ::operator++(int)
-{
-  value_type temp = *this;
-  value_type result = temp++;
-  *this = temp;
-  return result;
-} // end reference::operator++()
-
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator+=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp += rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator+=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator--(void)
-{
-  value_type temp = *this;
-  --temp;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator--()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::value_type
-    reference<Element,Pointer,Derived>
-      ::operator--(int)
-{
-  value_type temp = *this;
-  value_type result = temp--;
-  *this = temp;
-  return result;
-} // end reference::operator--()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator-=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp -= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator-=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator*=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp *= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator*=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator/=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp /= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator/=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator%=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp %= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator%=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator<<=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp <<= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator<<=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator>>=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp >>= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator>>=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator&=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp &= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator&=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator|=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp |= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator|=()
-
-template<typename Element, typename Pointer, typename Derived>
-  typename reference<Element,Pointer,Derived>::derived_type &
-    reference<Element,Pointer,Derived>
-      ::operator^=(const value_type &rhs)
-{
-  value_type temp = *this;
-  temp ^= rhs;
-  *this = temp;
-  return static_cast<derived_type&>(*this);
-} // end reference::operator^=()
-
-template<typename Element, typename Pointer, typename Derived,
-         typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os,
-           const reference<Element, Pointer, Derived> &y) {
-  typedef typename reference<Element, Pointer, Derived>::value_type value_type;
-  return os << static_cast<value_type>(y);
-} // end operator<<()
-
-} // end thrust
diff --git a/thrust/detail/reference_forward_declaration.h b/thrust/detail/reference_forward_declaration.h
index a8912ca43..6f2b99949 100644
--- a/thrust/detail/reference_forward_declaration.h
+++ b/thrust/detail/reference_forward_declaration.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -19,10 +19,10 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/use_default.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-template<typename Element, typename Pointer, typename Derived = use_default> class reference;
+template <typename Element, typename Pointer, typename Derived = use_default>
+class reference;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/remove.inl b/thrust/detail/remove.inl
index f5951fa91..7ccc0cc46 100644
--- a/thrust/detail/remove.inl
+++ b/thrust/detail/remove.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file remove.inl
- *  \brief Inline file for remove.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/remove.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/remove.h>
 #include <thrust/system/detail/adl/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -246,5 +242,5 @@ template<typename InputIterator1,
 } // end remove_copy_if()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/replace.inl b/thrust/detail/replace.inl
index de5bff4d5..629287bee 100644
--- a/thrust/detail/replace.inl
+++ b/thrust/detail/replace.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file replace.inl
- *  \brief Inline file for replace.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/replace.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/replace.h>
 #include <thrust/system/detail/adl/replace.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -218,5 +214,5 @@ template<typename ForwardIterator, typename T>
 } // end replace()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/reverse.inl b/thrust/detail/reverse.inl
index e8a018cd6..dc316d18f 100644
--- a/thrust/detail/reverse.inl
+++ b/thrust/detail/reverse.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file reverse.inl
- *  \brief Inline file for reverse.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/reverse.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/reverse.h>
 #include <thrust/system/detail/adl/reverse.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -87,5 +83,5 @@ template<typename BidirectionalIterator,
 } // end reverse_copy()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/scan.inl b/thrust/detail/scan.inl
index 5329d1118..b781b0e28 100644
--- a/thrust/detail/scan.inl
+++ b/thrust/detail/scan.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file scan.inl
- *  \brief Inline file for scan.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/scan.h>
@@ -28,8 +25,7 @@
 #include <thrust/system/detail/adl/scan.h>
 #include <thrust/system/detail/adl/scan_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -44,7 +40,7 @@ __host__ __device__
 {
   using thrust::system::detail::generic::inclusive_scan;
   return inclusive_scan(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, result);
-} // end inclusive_scan() 
+} // end inclusive_scan()
 
 
 __thrust_exec_check_disable__
@@ -522,5 +518,5 @@ template<typename InputIterator1,
 }
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/scatter.inl b/thrust/detail/scatter.inl
index 50ca8f3aa..30dd611d1 100644
--- a/thrust/detail/scatter.inl
+++ b/thrust/detail/scatter.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file scatter.inl
- *  \brief Inline file for scatter.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/scatter.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/scatter.h>
 #include <thrust/system/detail/adl/scatter.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -96,9 +94,9 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System3;
 
   System1 system1;
   System2 system2;
@@ -120,10 +118,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4;
 
   System1 system1;
   System2 system2;
@@ -148,10 +146,10 @@ template<typename InputIterator1,
 {
   using thrust::system::detail::generic::select_system;
 
-  typedef typename thrust::iterator_system<InputIterator1>::type       System1; 
-  typedef typename thrust::iterator_system<InputIterator2>::type       System2; 
-  typedef typename thrust::iterator_system<InputIterator3>::type       System3; 
-  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4; 
+  typedef typename thrust::iterator_system<InputIterator1>::type       System1;
+  typedef typename thrust::iterator_system<InputIterator2>::type       System2;
+  typedef typename thrust::iterator_system<InputIterator3>::type       System3;
+  typedef typename thrust::iterator_system<RandomAccessIterator>::type System4;
 
   System1 system1;
   System2 system2;
@@ -161,6 +159,5 @@ template<typename InputIterator1,
   return thrust::scatter_if(select_system(system1,system2,system3,system4), first, last, map, stencil, output, pred);
 } // end scatter_if()
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/select_system.h b/thrust/detail/select_system.h
new file mode 100644
index 000000000..968446162
--- /dev/null
+++ b/thrust/detail/select_system.h
@@ -0,0 +1,84 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/system/detail/generic/select_system.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace detail
+{
+
+// We need a way to compute the return type of `select_system`, which is found
+// by using `thrust::system::detail::generic::select_system` and then making an
+// ADL call. We have no trait that defines the return type. With the
+// limitations of C++11 return type deduction, we need to be able to stick all
+// of that into `decltype`. So, we put the using statement into a detail
+// namespace, and then implement the generic dispatch function in that
+// namespace.
+
+namespace select_system_detail
+{
+
+using thrust::system::detail::generic::select_system;
+
+struct select_system_fn final
+{
+  __thrust_exec_check_disable__
+  template <typename DerivedPolicy0>
+  __host__ __device__
+  auto operator()(
+    thrust::detail::execution_policy_base<DerivedPolicy0> const& exec0
+  ) const
+  THRUST_DECLTYPE_RETURNS(
+    select_system(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec0))
+    )
+  )
+
+  __thrust_exec_check_disable__
+  template <typename DerivedPolicy0, typename DerivedPolicy1>
+  __host__ __device__
+  auto operator()(
+    thrust::detail::execution_policy_base<DerivedPolicy0> const& exec0
+  , thrust::detail::execution_policy_base<DerivedPolicy1> const& exec1
+  ) const
+  THRUST_DECLTYPE_RETURNS(
+    select_system(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec0))
+    , thrust::detail::derived_cast(thrust::detail::strip_const(exec1))
+    )
+  )
+};
+
+} // namespace select_system_detail
+
+THRUST_INLINE_CONSTANT select_system_detail::select_system_fn select_system{};
+
+} // detail
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/detail/seq.h b/thrust/detail/seq.h
index d1684989a..ba18c2dbf 100644
--- a/thrust/detail/seq.h
+++ b/thrust/detail/seq.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,20 +17,20 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/execution_policy.h>
-#include <thrust/detail/execute_with_allocator.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
 
-struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>
+struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::detail::sequential::execution_policy>
 {
   __host__ __device__
-  seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
+  constexpr seq_t() : thrust::system::detail::sequential::execution_policy<seq_t>() {}
 
   // allow any execution_policy to convert to seq_t
   template<typename DerivedPolicy>
@@ -38,26 +38,15 @@ struct seq_t : thrust::system::detail::sequential::execution_policy<seq_t>
   seq_t(const thrust::execution_policy<DerivedPolicy> &)
     : thrust::system::detail::sequential::execution_policy<seq_t>()
   {}
-
-  template<typename Allocator>
-    thrust::detail::execute_with_allocator<Allocator, thrust::system::detail::sequential::execution_policy>
-      operator()(Allocator &alloc) const
-  {
-    return thrust::detail::execute_with_allocator<Allocator, thrust::system::detail::sequential::execution_policy>(alloc);
-  }
 };
 
 
 } // end detail
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ detail::seq_t seq;
-#else
-static const detail::seq_t seq;
-#endif
+THRUST_INLINE_CONSTANT detail::seq_t seq;
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/sequence.inl b/thrust/detail/sequence.inl
index fff7cbb63..ffc9b968b 100644
--- a/thrust/detail/sequence.inl
+++ b/thrust/detail/sequence.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file sequence.inl
- *  \brief Inline file for sequence.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/sequence.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/sequence.h>
 #include <thrust/system/detail/adl/sequence.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -114,5 +110,5 @@ template<typename ForwardIterator, typename T>
 } // end sequence()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/set_operations.inl b/thrust/detail/set_operations.inl
index 42cf5ed35..7915f7b3e 100644
--- a/thrust/detail/set_operations.inl
+++ b/thrust/detail/set_operations.inl
@@ -14,9 +14,7 @@
  *  limitations under the License.
  */
 
-/*! \file set_operations.inl
- *  \brief Inline file for set_operations.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -24,8 +22,7 @@
 #include <thrust/system/detail/generic/set_operations.h>
 #include <thrust/system/detail/adl/set_operations.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -864,5 +861,5 @@ template<typename InputIterator1,
 } // end set_union_by_key()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/shuffle.inl b/thrust/detail/shuffle.inl
new file mode 100644
index 000000000..48f5ba639
--- /dev/null
+++ b/thrust/detail/shuffle.inl
@@ -0,0 +1,83 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/shuffle.h>
+#include <thrust/system/detail/generic/select_system.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+THRUST_NAMESPACE_BEGIN
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g) {
+  using thrust::system::detail::generic::shuffle;
+  return shuffle(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, g);
+}
+
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System;
+  System system;
+
+  return thrust::shuffle(select_system(system), first, last, g);
+}
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result,
+    URBG&& g) {
+  using thrust::system::detail::generic::shuffle_copy;
+  return shuffle_copy(
+      thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+      first, last, result, g);
+}
+
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g) {
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<RandomIterator>::type System1;
+  typedef typename thrust::iterator_system<OutputIterator>::type System2;
+
+  System1 system1;
+  System2 system2;
+
+  return thrust::shuffle_copy(select_system(system1, system2), first, last,
+                              result, g);
+}
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/detail/sort.inl b/thrust/detail/sort.inl
index 2ee9f662f..53f8bad93 100644
--- a/thrust/detail/sort.inl
+++ b/thrust/detail/sort.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file sort.inl
- *  \brief Inline file for sort.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/sort.h>
@@ -26,8 +23,7 @@
 #include <thrust/system/detail/generic/sort.h>
 #include <thrust/system/detail/adl/sort.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -218,6 +214,7 @@ template<typename RandomAccessIterator>
 
 template<typename RandomAccessIterator,
          typename StrictWeakOrdering>
+  __host__ __device__
   void sort(RandomAccessIterator first,
             RandomAccessIterator last,
             StrictWeakOrdering comp)
@@ -243,7 +240,7 @@ template<typename RandomAccessIterator>
   System system;
 
   return thrust::stable_sort(select_system(system), first, last);
-} // end stable_sort() 
+} // end stable_sort()
 
 
 template<typename RandomAccessIterator,
@@ -348,7 +345,7 @@ template<typename ForwardIterator>
                  ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -364,7 +361,7 @@ template<typename ForwardIterator,
                  Compare comp)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -378,7 +375,7 @@ template<typename ForwardIterator>
                                   ForwardIterator last)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -394,7 +391,7 @@ template<typename ForwardIterator,
                                   Compare comp)
 {
   using thrust::system::detail::generic::select_system;
-  
+
   typedef typename thrust::iterator_system<ForwardIterator>::type System;
 
   System system;
@@ -403,5 +400,5 @@ template<typename ForwardIterator,
 } // end is_sorted_until()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/static_assert.h b/thrust/detail/static_assert.h
index ca11ef1be..0e6132790 100644
--- a/thrust/detail/static_assert.h
+++ b/thrust/detail/static_assert.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,6 @@
  *  limitations under the License.
  */
 
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-
 /*
  * (C) Copyright John Maddock 2000.
  * 
@@ -28,52 +23,69 @@
  * For more information, see http://www.boost.org
  */
 
-//
-// Helper macro THRUST_JOIN (based on BOOST_JOIN):
-// The following piece of macro magic joins the two
-// arguments together, even when one of the arguments is
-// itself a macro (see 16.3.1 in C++ standard).  The key
-// is that macro expansion of macro arguments does not
-// occur in THRUST_DO_JOIN2 but does in THRUST_DO_JOIN.
-//
-#define THRUST_JOIN( X, Y ) THRUST_DO_JOIN( X, Y )
-#define THRUST_DO_JOIN( X, Y ) THRUST_DO_JOIN2(X,Y)
-#define THRUST_DO_JOIN2( X, Y ) X##Y
-
-namespace thrust
-{
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/preprocessor.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
-// HP aCC cannot deal with missing names for template value parameters
-template <bool x> struct STATIC_ASSERTION_FAILURE;
+template <typename, bool x>
+struct depend_on_instantiation
+{
+  THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT bool value = x;
+};
 
-template <> struct STATIC_ASSERTION_FAILURE<true> { enum { value = 1 }; };
+#if THRUST_CPP_DIALECT >= 2011
 
-// HP aCC cannot deal with missing names for template value parameters
-template<int x> struct static_assert_test{};
+#  if THRUST_CPP_DIALECT >= 2017
+#    define THRUST_STATIC_ASSERT(B)        static_assert(B)
+#  else
+#    define THRUST_STATIC_ASSERT(B)        static_assert(B, "static assertion failed")
+#  endif
+#  define THRUST_STATIC_ASSERT_MSG(B, msg) static_assert(B, msg)
 
-template<typename, bool x>
-  struct depend_on_instantiation
-{
-  static const bool value = x;
-};
+#else // Older than C++11.
+
+// HP aCC cannot deal with missing names for template value parameters.
+template <bool x> struct STATIC_ASSERTION_FAILURE;
 
-} // end detail
+template <> struct STATIC_ASSERTION_FAILURE<true> {};
 
-} // end thrust
+// HP aCC cannot deal with missing names for template value parameters.
+template <int x> struct static_assert_test {};
 
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION >= 40800)
-  // gcc 4.8+ will complain about this typedef being unused unless we annotate it as such
-#  define THRUST_STATIC_ASSERT( B ) \
-   typedef ::thrust::detail::static_assert_test<\
-      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE< (bool)( B ) >)>\
-         THRUST_JOIN(thrust_static_assert_typedef_, __LINE__) __attribute__((unused))
+#if    (  (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC)                  \
+       && (THRUST_GCC_VERSION >= 40800))                                      \
+    || (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG)
+  // Clang and GCC 4.8+ will complain about this typedef being unused unless we
+  // annotate it as such.
+#  define THRUST_STATIC_ASSERT(B)                                             \
+    typedef THRUST_NS_QUALIFIER::detail::static_assert_test<                  \
+      sizeof(THRUST_NS_QUALIFIER::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)\
+    >                                                                         \
+      THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
+      __attribute__((unused))                                                 \
+    /**/      
 #else
-#  define THRUST_STATIC_ASSERT( B ) \
-   typedef ::thrust::detail::static_assert_test<\
-      sizeof(::thrust::detail::STATIC_ASSERTION_FAILURE< (bool)( B ) >)>\
-         THRUST_JOIN(thrust_static_assert_typedef_, __LINE__)
-#endif // gcc 4.8+
+#  define THRUST_STATIC_ASSERT(B)                                             \
+    typedef THRUST_NS_QUALIFIER::detail::static_assert_test<                  \
+      sizeof(THRUST_NS_QUALIFIER::detail::STATIC_ASSERTION_FAILURE<(bool)(B)>)\
+    >                                                                         \
+      THRUST_PP_CAT2(thrust_static_assert_typedef_, __LINE__)                 \
+    /**/      
+#endif
+
+#define THRUST_STATIC_ASSERT_MSG(B, msg) THRUST_STATIC_ASSERT(B)
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
+
 
diff --git a/thrust/detail/static_map.h b/thrust/detail/static_map.h
index 872a73aef..9f0d79e83 100644
--- a/thrust/detail/static_map.h
+++ b/thrust/detail/static_map.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace static_map_detail
@@ -166,5 +165,5 @@ unsigned int lookup(unsigned int key)
 
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/swap.h b/thrust/detail/swap.h
index 96783c762..305750f8a 100644
--- a/thrust/detail/swap.h
+++ b/thrust/detail/swap.h
@@ -19,8 +19,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 __thrust_exec_check_disable__
 template<typename Assignable1, typename Assignable2>
@@ -32,5 +31,5 @@ inline void swap(Assignable1 &a, Assignable2 &b)
   b = temp;
 } // end swap()
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/swap.inl b/thrust/detail/swap.inl
index 9364ef8ad..196c34f41 100644
--- a/thrust/detail/swap.inl
+++ b/thrust/detail/swap.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/swap.h>
 #include <thrust/detail/swap.h>
diff --git a/thrust/detail/swap_ranges.inl b/thrust/detail/swap_ranges.inl
index 8ed97cc74..1f35c1ff3 100644
--- a/thrust/detail/swap_ranges.inl
+++ b/thrust/detail/swap_ranges.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file swap_ranges.inl
- *  \brief Inline file for swap_ranges.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/swap.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/swap_ranges.h>
 #include <thrust/system/detail/adl/swap_ranges.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -62,5 +60,5 @@ template<typename ForwardIterator1,
 } // end swap_ranges()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tabulate.inl b/thrust/detail/tabulate.inl
index f6385234e..308be061f 100644
--- a/thrust/detail/tabulate.inl
+++ b/thrust/detail/tabulate.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/tabulate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/system/detail/generic/tabulate.h>
 #include <thrust/system/detail/adl/tabulate.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -53,5 +54,5 @@ template<typename ForwardIterator, typename UnaryOperation>
 } // end tabulate()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/temporary_array.h b/thrust/detail/temporary_array.h
index 535842263..cf4bc7d2d 100644
--- a/thrust/detail/temporary_array.h
+++ b/thrust/detail/temporary_array.h
@@ -20,16 +20,29 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace detail
+{
+
+// Forward declare temporary_array, as it's used by the CUDA copy backend, which
+// is included in contiguous_storage's definition.
+template<typename T, typename System>
+  class temporary_array;
+
+} // end detail
+THRUST_NAMESPACE_END
+
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/detail/tagged_iterator.h>
 #include <thrust/detail/contiguous_storage.h>
 #include <thrust/detail/allocator/temporary_allocator.h>
 #include <thrust/detail/allocator/no_throw_allocator.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -162,7 +175,7 @@ template<typename Iterator, typename FromSystem, typename ToSystem>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/temporary_array.inl>
 
diff --git a/thrust/detail/temporary_array.inl b/thrust/detail/temporary_array.inl
index e730966c0..90b7279ac 100644
--- a/thrust/detail/temporary_array.inl
+++ b/thrust/detail/temporary_array.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,14 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/temporary_array.h>
 #include <thrust/distance.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/detail/type_traits.h>
 
-
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -162,5 +164,5 @@ __host__ __device__
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/temporary_buffer.h b/thrust/detail/temporary_buffer.h
index d27693ebc..be95e7180 100644
--- a/thrust/detail/temporary_buffer.h
+++ b/thrust/detail/temporary_buffer.h
@@ -21,15 +21,13 @@
 #include <thrust/pair.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/execute_with_allocator.h>
 #include <thrust/system/detail/generic/temporary_buffer.h>
 #include <thrust/system/detail/adl/temporary_buffer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
-namespace get_temporary_buffer_detail
-{
 
 
 template<typename T, typename DerivedPolicy, typename Pair>
@@ -45,7 +43,6 @@ __host__ __device__
 } // end down_cast_pair()
 
 
-} // end get_temporary_buffer_detail
 } // end detail
 
 
@@ -55,22 +52,24 @@ __host__ __device__
   thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
     get_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n)
 {
+  using thrust::detail::get_temporary_buffer; // execute_with_allocator
   using thrust::system::detail::generic::get_temporary_buffer;
 
-  return thrust::detail::get_temporary_buffer_detail::down_cast_pair<T,DerivedPolicy>(get_temporary_buffer<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n));
+  return thrust::detail::down_cast_pair<T,DerivedPolicy>(get_temporary_buffer<T>(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), n));
 } // end get_temporary_buffer()
 
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
-  void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer p)
+  void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n)
 {
+  using thrust::detail::return_temporary_buffer; // execute_with_allocator
   using thrust::system::detail::generic::return_temporary_buffer;
 
-  return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p);
+  return return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p, n);
 } // end return_temporary_buffer()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform.inl b/thrust/detail/transform.inl
index c27e4de27..62bafd35e 100644
--- a/thrust/detail/transform.inl
+++ b/thrust/detail/transform.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file transform.inl
- *  \brief Inline file for transform.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/transform.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/transform.h>
 #include <thrust/system/detail/adl/transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -245,5 +243,5 @@ template<typename InputIterator1,
 } // end transform_if()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform_reduce.inl b/thrust/detail/transform_reduce.inl
index 571b0e79b..702dd9f73 100644
--- a/thrust/detail/transform_reduce.inl
+++ b/thrust/detail/transform_reduce.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file transform_reduce.inl
- *  \brief Inline file for transform_reduce.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,14 +22,13 @@
 #include <thrust/system/detail/generic/transform_reduce.h>
 #include <thrust/system/detail/adl/transform_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
 template<typename DerivedPolicy,
-         typename InputIterator, 
-         typename UnaryFunction, 
+         typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
 __host__ __device__
@@ -48,8 +44,8 @@ __host__ __device__
 } // end transform_reduce()
 
 
-template<typename InputIterator, 
-         typename UnaryFunction, 
+template<typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
   OutputType transform_reduce(InputIterator first,
@@ -68,5 +64,5 @@ template<typename InputIterator,
 } // end transform_reduce()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/transform_scan.inl b/thrust/detail/transform_scan.inl
index d6a488b0a..957001cef 100644
--- a/thrust/detail/transform_scan.inl
+++ b/thrust/detail/transform_scan.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file transform_scan.inl
- *  \brief Inline file for transform_scan.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/scan.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/transform_scan.h>
 #include <thrust/system/detail/adl/transform_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -115,5 +113,5 @@ template<typename InputIterator,
 } // end transform_exclusive_scan()
 
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/trivial_sequence.h b/thrust/detail/trivial_sequence.h
index 03bfe37b6..2cf98e787 100644
--- a/thrust/detail/trivial_sequence.h
+++ b/thrust/detail/trivial_sequence.h
@@ -23,13 +23,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/detail/temporary_array.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -47,7 +49,6 @@ struct _trivial_sequence<Iterator, DerivedPolicy, thrust::detail::true_type>
     __host__ __device__
     _trivial_sequence(thrust::execution_policy<DerivedPolicy> &, Iterator _first, Iterator _last) : first(_first), last(_last)
     {
-//        std::cout << "trivial case" << std::endl;
     }
 
     __host__ __device__
@@ -70,7 +71,6 @@ struct _trivial_sequence<Iterator, DerivedPolicy, thrust::detail::false_type>
     _trivial_sequence(thrust::execution_policy<DerivedPolicy> &exec, Iterator first, Iterator last)
       : buffer(exec, first, last)
     {
-//        std::cout << "non-trivial case" << std::endl;
     }
 
     __host__ __device__
@@ -82,9 +82,9 @@ struct _trivial_sequence<Iterator, DerivedPolicy, thrust::detail::false_type>
 
 template <typename Iterator, typename DerivedPolicy>
 struct trivial_sequence
-  : detail::_trivial_sequence<Iterator, DerivedPolicy, typename thrust::detail::is_trivial_iterator<Iterator>::type>
+  : detail::_trivial_sequence<Iterator, DerivedPolicy, typename thrust::is_contiguous_iterator<Iterator>::type>
 {
-    typedef _trivial_sequence<Iterator, DerivedPolicy, typename thrust::detail::is_trivial_iterator<Iterator>::type> super_t;
+    typedef _trivial_sequence<Iterator, DerivedPolicy, typename thrust::is_contiguous_iterator<Iterator>::type> super_t;
 
     __host__ __device__
     trivial_sequence(thrust::execution_policy<DerivedPolicy> &exec, Iterator first, Iterator last) : super_t(exec, first, last) { }
@@ -92,5 +92,5 @@ struct trivial_sequence
 
 } // end namespace detail
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple.inl b/thrust/detail/tuple.inl
index 5602dbd51..f4930bf4b 100644
--- a/thrust/detail/tuple.inl
+++ b/thrust/detail/tuple.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/swap.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // define null_type
 struct null_type {};
@@ -50,38 +53,79 @@ template <
   class T9 = null_type>
 class tuple;
 
-// forward declaration of tuple_element
-template<int i, typename T> struct tuple_element;
 
-// specializations for tuple_element
-template<class T>
-  struct tuple_element<0,T>
-{
-  typedef typename T::head_type type;
-}; // end tuple_element<0,T>
+template <size_t N, class T> struct tuple_element;
 
-template<int N, class T>
-  struct tuple_element<N, const T>
+template<size_t N, class T>
+  struct tuple_element_impl
 {
   private:
     typedef typename T::tail_type Next;
-    typedef typename tuple_element<N-1, Next>::type unqualified_type;
 
   public:
-    typedef typename thrust::detail::add_const<unqualified_type>::type type;
-}; // end tuple_element<N, const T>
+    /*! The result of this metafunction is returned in \c type.
+     */
+    typedef typename tuple_element_impl<N-1, Next>::type type;
+}; // end tuple_element
 
 template<class T>
-  struct tuple_element<0,const T>
+  struct tuple_element_impl<0,T>
+{
+  typedef typename T::head_type type;
+};
+
+template <size_t N, class T>
+  struct tuple_element<N, T const>
 {
-  typedef typename thrust::detail::add_const<typename T::head_type>::type type;
-}; // end tuple_element<0,const T>
+    using type = typename std::add_const<typename tuple_element<N, T>::type>::type;
+};
 
+template <size_t N, class T>
+struct tuple_element<N, T volatile>
+{
+    using type = typename std::add_volatile<typename tuple_element<N, T>::type>::type;
+};
 
+template <size_t N, class T>
+  struct tuple_element<N, T const volatile>
+{
+    using type = typename std::add_cv<typename tuple_element<N, T>::type>::type;
+};
+
+template <size_t N, class T>
+struct tuple_element{
+    using type = typename tuple_element_impl<N,T>::type;
+};
 
 // forward declaration of tuple_size
 template<class T> struct tuple_size;
 
+template<class T>
+  struct tuple_size<T const> : public tuple_size<T> {};
+
+template<class T>
+  struct tuple_size<T volatile> : public tuple_size<T> {};
+
+template<class T>
+  struct tuple_size<T const volatile> : public tuple_size<T> {};
+
+/*! This metafunction returns the number of elements
+ *  of a \p tuple type of interest.
+ *
+ *  \tparam T A \c tuple type of interest.
+ *
+ *  \see pair
+ *  \see tuple
+ */
+template<class T>
+  struct tuple_size
+{
+  /*! The result of this metafunction is returned in \c value.
+   */
+  static const int value = 1 + tuple_size<typename T::tail_type>::value;
+}; // end tuple_size
+
+
 // specializations for tuple_size
 template<>
   struct tuple_size< tuple<> >
@@ -169,7 +213,7 @@ struct get_class
     // XXX we may not need to deal with this for any compiler we care about -jph
     //return get_class<N-1>::BOOST_NESTED_TEMPLATE get<RET>(t.tail);
     return get_class<N-1>::template get<RET>(t.tail);
-    
+
     // gcc 4.3 couldn't compile this:
     //return get_class<N-1>::get<RET>(t.tail);
   }
@@ -309,6 +353,11 @@ template <class HT, class TT>
   inline __host__ __device__
   cons( const cons<HT2, TT2>& u ) : head(u.head), tail(u.tail) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
+  __thrust_exec_check_disable__
   template <class HT2, class TT2>
   inline __host__ __device__
   cons& operator=( const cons<HT2, TT2>& u ) {
@@ -317,6 +366,7 @@ template <class HT, class TT>
 
   // must define assignment operator explicitly, implicit version is
   // illformed if HT is a reference (12.8. (12))
+  __thrust_exec_check_disable__
   inline __host__ __device__
   cons& operator=(const cons& u) {
     head = u.head; tail = u.tail;  return *this;
@@ -410,6 +460,11 @@ template <class HT>
   inline __host__ __device__
   cons( const cons<HT2, null_type>& u ) : head(u.head) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  cons(const cons &) = default;
+#endif
+
+  __thrust_exec_check_disable__
   template <class HT2>
   inline __host__ __device__
   cons& operator=(const cons<HT2, null_type>& u )
@@ -587,7 +642,7 @@ inline typename access_traits<
 get(detail::cons<HT, TT>& c)
 {
   //return detail::get_class<N>::BOOST_NESTED_TEMPLATE
-  
+
   // gcc 4.3 couldn't compile this:
   //return detail::get_class<N>::
 
@@ -810,6 +865,7 @@ inline bool eq(const T1& lhs, const T2& rhs) {
          eq(lhs.get_tail(), rhs.get_tail());
 }
 template<>
+__host__ __device__
 inline bool eq<null_type,null_type>(const null_type&, const null_type&) { return true; }
 
 template<class T1, class T2>
@@ -944,5 +1000,5 @@ inline bool operator>=(const detail::cons<T1, T2>& lhs, const detail::cons<S1, S
   return detail::gte(lhs, rhs);
 } // end operator>=()
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple_algorithms.h b/thrust/detail/tuple_algorithms.h
new file mode 100644
index 000000000..2e49f4281
--- /dev/null
+++ b/thrust/detail/tuple_algorithms.h
@@ -0,0 +1,110 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/integer_sequence.h>
+
+#include <tuple>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename Tuple, std::size_t... Is>
+auto tuple_subset(Tuple&& t, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(std::make_tuple(std::get<Is>(THRUST_FWD(t))...));
+
+namespace detail
+{
+
+template <typename Tuple, typename F, std::size_t... Is>
+void tuple_for_each_impl(Tuple&& t, F&& f, index_sequence<Is...>)
+{
+  auto l = { (f(std::get<Is>(t)), 0)... };
+  THRUST_UNUSED_VAR(l);
+}
+
+template <typename Tuple, typename F, std::size_t... Is>
+auto tuple_transform_impl(Tuple&& t, F&& f, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(std::make_tuple(f(std::get<Is>(t))...));
+
+} // namespace detail
+
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...>& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...> const& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_for_each(std::tuple<Ts...>&& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_for_each_impl(
+    std::move(t)
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...>& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...> const& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    t
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+template <typename... Ts, typename F>
+auto tuple_transform(std::tuple<Ts...>&& t, F&& f)
+THRUST_DECLTYPE_RETURNS(
+  detail::tuple_transform_impl(
+    std::move(t)
+  , THRUST_FWD(f)
+  , make_index_sequence<sizeof...(Ts)>{}
+  )
+);
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/detail/tuple_meta_transform.h b/thrust/detail/tuple_meta_transform.h
index 4aca1a91b..285cae8b4 100644
--- a/thrust/detail/tuple_meta_transform.h
+++ b/thrust/detail/tuple_meta_transform.h
@@ -16,162 +16,43 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/tuple.h>
+#include <thrust/type_traits/integer_sequence.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
+// introduce an intermediate type tuple_meta_transform_WAR_NVCC
+// rather than directly specializing tuple_meta_transform with
+// default argument IndexSequence = thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>
+// to workaround nvcc 11.0 compiler bug
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
-  struct tuple_meta_transform;
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,0>
-{
-  typedef null_type type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,1>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,2>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type
-  > type;
-};
+         typename IndexSequence>
+  struct tuple_meta_transform_WAR_NVCC;
 
 template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,3>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,4>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,5>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,6>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,7>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,8>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,9>
+         template<typename> class UnaryMetaFunction,
+         size_t... Is>
+  struct tuple_meta_transform_WAR_NVCC<Tuple, UnaryMetaFunction, thrust::index_sequence<Is...>>
 {
   typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type
+    typename UnaryMetaFunction<typename thrust::tuple_element<Is,Tuple>::type>::type...
   > type;
 };
 
 template<typename Tuple,
          template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,10>
+  struct tuple_meta_transform
 {
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<9,Tuple>::type>::type
-  > type;
+  typedef typename tuple_meta_transform_WAR_NVCC<Tuple, UnaryMetaFunction, thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>>::type type;
 };
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/tuple_transform.h b/thrust/detail/tuple_transform.h
index 1f53e2fde..1011d5179 100644
--- a/thrust/detail/tuple_transform.h
+++ b/thrust/detail/tuple_transform.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/tuple.h>
 #include <thrust/detail/tuple_meta_transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -28,332 +29,15 @@ namespace detail
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
          typename UnaryFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
+         typename IndexSequence = thrust::make_index_sequence<thrust::tuple_size<Tuple>::value>>
   struct tuple_transform_functor;
 
 
 template<typename Tuple,
          template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,0>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    return thrust::null_type();
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    return thrust::null_type();
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,1>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,2>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,3>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,4>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,5>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,6>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,7>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,8>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,9>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,10>
+         typename UnaryFunction,
+         size_t... Is>
+  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,thrust::index_sequence<Is...>>
 {
   static __host__
   typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
@@ -361,16 +45,7 @@ template<typename Tuple,
   {
     typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
 
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
+    return XfrmTuple(f(thrust::get<Is>(t))...);
   }
 
   static __host__ __device__
@@ -379,16 +54,7 @@ template<typename Tuple,
   {
     typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
 
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
+    return XfrmTuple(f(thrust::get<Is>(t))...);
   }
 };
 
@@ -414,5 +80,5 @@ tuple_host_device_transform(const Tuple &t, UnaryFunction f)
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_deduction.h b/thrust/detail/type_deduction.h
new file mode 100644
index 000000000..6f240711d
--- /dev/null
+++ b/thrust/detail/type_deduction.h
@@ -0,0 +1,90 @@
+// Copyright (c)      2018 NVIDIA Corporation
+//                         (Bryce Adelstein Lelbach <brycelelbach@gmail.com>)
+// Copyright (c) 2013-2018 Eric Niebler (`THRUST_RETURNS`, etc)
+// Copyright (c) 2016-2018 Casey Carter (`THRUST_RETURNS`, etc)
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/preprocessor.h>
+
+#include <utility>
+#include <type_traits>
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// \def THRUST_FWD(x)
+/// \brief Performs universal forwarding of a universal reference.
+///
+#define THRUST_FWD(x) ::std::forward<decltype(x)>(x)
+
+/// \def THRUST_MVCAP(x)
+/// \brief Capture `x` into a lambda by moving.
+///
+#define THRUST_MVCAP(x) x = ::std::move(x)
+
+/// \def THRUST_RETOF(invocable, ...)
+/// \brief Expands to the type returned by invoking an instance of the invocable
+///        type \a invocable with parameters of type \c __VA_ARGS__. Must
+///        be called with 1 or fewer parameters to the invocable.
+///
+#define THRUST_RETOF(...)   THRUST_PP_DISPATCH(THRUST_RETOF, __VA_ARGS__)
+#define THRUST_RETOF1(C)    decltype(::std::declval<C>()())
+#define THRUST_RETOF2(C, V) decltype(::std::declval<C>()(::std::declval<V>()))
+
+/// \def THRUST_RETURNS(...)
+/// \brief Expands to a function definition that returns the expression
+///        \c __VA_ARGS__.
+///
+#define THRUST_RETURNS(...)                                                   \
+  noexcept(noexcept(__VA_ARGS__))                                             \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+
+/// \def THRUST_DECLTYPE_RETURNS(...)
+/// \brief Expands to a function definition, including a trailing returning
+///        type, that returns the expression \c __VA_ARGS__.
+///
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+#else
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
+    noexcept(noexcept(__VA_ARGS__))                                           \
+    -> decltype(__VA_ARGS__)                                                  \
+    { return (__VA_ARGS__); }                                                 \
+    /**/
+#endif
+
+/// \def THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)
+/// \brief Expands to a function definition, including a trailing returning
+///        type, that returns the expression \c __VA_ARGS__. It shall only
+///        participate in overload resolution if \c condition is \c true.
+///
+// Trailing return types seem to confuse Doxygen, and cause it to interpret
+// parts of the function's body as new function signatures.
+#if defined(THRUST_DOXYGEN)
+  #define THRUST_DECLTYPE_RETURNS(...)                                        \
+  { return (__VA_ARGS__); }                                                   \
+  /**/
+#else
+  #define THRUST_DECLTYPE_RETURNS_WITH_SFINAE_CONDITION(condition, ...)       \
+    noexcept(noexcept(__VA_ARGS__))                                           \
+    -> typename std::enable_if<condition, decltype(__VA_ARGS__)>::type        \
+    { return (__VA_ARGS__); }                                                 \
+    /**/
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/detail/type_traits.h b/thrust/detail/type_traits.h
index 31df7aaf3..f25eaeaf0 100644
--- a/thrust/detail/type_traits.h
+++ b/thrust/detail/type_traits.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -24,26 +24,11 @@
 
 #include <thrust/detail/config.h>
 
-// XXX nvcc 2.2 closed beta can't compile type_traits
-//// find type_traits
-//
-//#ifdef __GNUC__
-//
-//#if __GNUC__ == 4 && __GNUC_MINOR__ == 2
-//#include <tr1/type_traits>
-//#elif __GNUC__ == 4 && __GNUC_MINOR__ > 2
-//#include <type_traits>
-//#endif // GCC version
-//
-//#endif // GCC
-//
-//#ifdef _MSC_VER
-//#include <type_traits>
-//#endif // MSVC
+#include <cuda/std/type_traits>
 
+#include <type_traits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of device_reference
 template<typename T> class device_reference;
@@ -51,19 +36,35 @@ template<typename T> class device_reference;
 namespace detail
 {
  /// helper classes [4.3].
- template<typename _Tp, _Tp __v>
+ template<typename T, T v>
    struct integral_constant
    {
-     static const _Tp                      value = __v;
-     typedef _Tp                           value_type;
-     typedef integral_constant<_Tp, __v>   type;
+     THRUST_INLINE_INTEGRAL_MEMBER_CONSTANT T value = v;
+
+     typedef T                       value_type;
+     typedef integral_constant<T, v> type;
+
+     // We don't want to switch to std::integral_constant, because we want access
+     // to the C++14 operator(), but we'd like standard traits to interoperate
+     // with our version when tag dispatching.
+     integral_constant() = default;
+
+     integral_constant(integral_constant const&) = default;
+
+     integral_constant& operator=(integral_constant const&) = default;
+
+     constexpr __host__ __device__
+     integral_constant(std::integral_constant<T, v>) noexcept {}
+
+     constexpr __host__ __device__ operator value_type() const noexcept { return value; }
+     constexpr __host__ __device__ value_type operator()() const noexcept { return value; }
    };
  
  /// typedef for true_type
- typedef integral_constant<bool, true>     true_type;
+ typedef integral_constant<bool, true>  true_type;
 
  /// typedef for true_type
- typedef integral_constant<bool, false>    false_type;
+ typedef integral_constant<bool, false> false_type;
 
 //template<typename T> struct is_integral : public std::tr1::is_integral<T> {};
 template<typename T> struct is_integral                           : public false_type {};
@@ -111,23 +112,23 @@ template<typename T> struct is_void             : public false_type {};
 template<>           struct is_void<void>       : public true_type {};
 template<>           struct is_void<const void> : public true_type {};
 
+template<typename T> struct is_non_bool_integral       : public is_integral<T> {};
+template<>           struct is_non_bool_integral<bool> : public false_type {};
 
-namespace tt_detail
-{
-
-
-} // end tt_detail
+template<typename T> struct is_non_bool_arithmetic       : public is_arithmetic<T> {};
+template<>           struct is_non_bool_arithmetic<bool> : public false_type {};
 
 template<typename T> struct is_pod
    : public integral_constant<
        bool,
        is_void<T>::value || is_pointer<T>::value || is_arithmetic<T>::value
-#if THRUST_HOST_COMPILER   == THRUST_HOST_COMPILER_MSVC
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC || \
+    THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_CLANG
 // use intrinsic type traits
        || __is_pod(T)
 #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
 // only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
+#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)
        || __is_pod(T)
 #endif // GCC VERSION
 #endif // THRUST_HOST_COMPILER
@@ -135,34 +136,14 @@ template<typename T> struct is_pod
  {};
 
 
-template<typename T> struct has_trivial_constructor
-  : public integral_constant<
-      bool,
-      is_pod<T>::value
-#if THRUST_HOST_COMPILER   == THRUST_HOST_COMPILER_MSVC
-      || __has_trivial_constructor(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_constructor(T)
-#endif // GCC VERSION
-#endif // THRUST_HOST_COMPILER
-      >
+template <typename T> 
+struct has_trivial_constructor
+  : public integral_constant<bool, is_pod<T>::value || ::cuda::std::is_trivially_constructible<T>::value> 
 {};
 
-template<typename T> struct has_trivial_copy_constructor
-  : public integral_constant<
-      bool,
-      is_pod<T>::value
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-      || __has_trivial_copy(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_copy(T)
-#endif // GCC VERSION
-#endif // THRUST_HOST_COMPILER
-    >
+template<typename T> 
+struct has_trivial_copy_constructor
+  : public integral_constant<bool, is_pod<T>::value || ::cuda::std::is_trivially_copyable<T>::value>
 {};
 
 template<typename T> struct has_trivial_destructor : public is_pod<T> {};
@@ -225,6 +206,8 @@ template<typename T>
 template<typename T> struct is_reference     : public false_type {};
 template<typename T> struct is_reference<T&> : public true_type {};
 
+template<typename T> struct is_proxy_reference  : public false_type {};
+
 template<typename T> struct is_device_reference                                : public false_type {};
 template<typename T> struct is_device_reference< thrust::device_reference<T> > : public true_type {};
 
@@ -290,6 +273,12 @@ template<typename T1, typename T2>
 {
 }; // end lazy_is_different
 
+#if THRUST_CPP_DIALECT >= 2011
+
+using std::is_convertible;
+
+#else
+
 namespace tt_detail
 {
 
@@ -304,9 +293,8 @@ template<typename T>
 }; // end is_int_or_cref
 
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
-__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_BEGIN
-
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_BEGIN
 
 template<typename From, typename To>
   struct is_convertible_sfinae
@@ -324,8 +312,8 @@ template<typename From, typename To>
 }; // end is_convertible_sfinae
 
 
-__THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_END
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_DISABLE_MSVC_FORCING_VALUE_TO_BOOL_WARNING_END
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
 
 
 template<typename From, typename To>
@@ -366,6 +354,7 @@ template<typename From, typename To>
 {
 }; // end is_convertible
 
+#endif
 
 template<typename T1, typename T2>
   struct is_one_convertible_to_the_other
@@ -377,22 +366,44 @@ template<typename T1, typename T2>
 
 
 // mpl stuff
+template<typename... Conditions>
+  struct or_;
 
-template <typename Condition1,               typename Condition2,              typename Condition3 = false_type,
-          typename Condition4  = false_type, typename Condition5 = false_type, typename Condition6 = false_type,
-          typename Condition7  = false_type, typename Condition8 = false_type, typename Condition9 = false_type,
-          typename Condition10 = false_type>
-  struct or_
+template <>
+  struct or_<>
     : public integral_constant<
         bool,
-        Condition1::value || Condition2::value || Condition3::value || Condition4::value || Condition5::value || Condition6::value || Condition7::value || Condition8::value || Condition9::value || Condition10::value
+        false_type::value  // identity for or_
       >
 {
 }; // end or_
 
-template <typename Condition1, typename Condition2, typename Condition3 = true_type>
-  struct and_
-    : public integral_constant<bool, Condition1::value && Condition2::value && Condition3::value>
+template <typename Condition, typename... Conditions>
+  struct or_<Condition, Conditions...>
+    : public integral_constant<
+        bool,
+        Condition::value || or_<Conditions...>::value
+      >
+{
+}; // end or_
+
+template <typename... Conditions>
+  struct and_;
+
+template<>
+  struct and_<>
+    : public integral_constant<
+        bool,
+        true_type::value // identity for and_
+      >
+{
+}; // end and_
+
+template <typename Condition, typename... Conditions>
+  struct and_<Condition, Conditions...>
+    : public integral_constant<
+        bool,
+        Condition::value && and_<Conditions...>::value>
 {
 }; // end and_
 
@@ -402,6 +413,12 @@ template <typename Boolean>
 {
 }; // end not_
 
+template<bool B, class T, class F>
+struct conditional { typedef T type; };
+ 
+template<class T, class F>
+struct conditional<false, T, F> { typedef F type; };
+
 template <bool, typename Then, typename Else>
   struct eval_if
 {
@@ -477,7 +494,7 @@ namespace tt_detail
 template<typename T> struct make_unsigned_simple;
 
 template<> struct make_unsigned_simple<char>                   { typedef unsigned char          type; };
-template<> struct make_unsigned_simple<signed char>            { typedef signed   char          type; };
+template<> struct make_unsigned_simple<signed char>            { typedef unsigned char          type; };
 template<> struct make_unsigned_simple<unsigned char>          { typedef unsigned char          type; };
 template<> struct make_unsigned_simple<short>                  { typedef unsigned short         type; };
 template<> struct make_unsigned_simple<unsigned short>         { typedef unsigned short         type; };
@@ -527,15 +544,7 @@ template<typename T>
 
 struct largest_available_float
 {
-#if defined(__CUDA_ARCH__)
-#  if (__CUDA_ARCH__ < 130)
-  typedef float type;
-#  else
-  typedef double type;
-#  endif
-#else
   typedef double type;
-#endif
 };
 
 // T1 wins if they are both the same size
@@ -548,6 +557,11 @@ template<typename T1, typename T2>
       >
 {};
 
+#if THRUST_CPP_DIALECT >= 2011
+
+using std::is_base_of;
+
+#else
 
 namespace is_base_of_ns
 {
@@ -582,6 +596,8 @@ template<typename Base, typename Derived>
       >
 {};
 
+#endif
+
 template<typename Base, typename Derived, typename Result = void>
   struct enable_if_base_of
     : enable_if<
@@ -602,7 +618,7 @@ template<typename T1, typename T2>
 
   template<typename T> static typename add_reference<T>::type declval();
   
-  template<unsigned int> struct helper { typedef void * type; };
+  template<size_t> struct helper { typedef void * type; };
 
   template<typename U1, typename U2> static yes_type test(typename helper<sizeof(declval<U1>() = declval<U2>())>::type);
 
@@ -640,7 +656,7 @@ template<typename T1, typename T2>
   <typename is_floating_point<T1>::type,typename is_floating_point<T2>::type>
   ::value>::type>
   {
-  typedef larger_type<T1,T2> type;
+  typedef typename larger_type<T1,T2>::type type;
   };
 
 template<typename T1, typename T2> 
@@ -659,9 +675,43 @@ template<typename T1, typename T2>
   typedef T1 type;
   };
 
+template<typename T>
+  struct is_empty_helper : public T
+  {
+  };
+
+struct is_empty_helper_base
+{
+};
+
+template<typename T>
+  struct is_empty : integral_constant<bool,
+    sizeof(is_empty_helper_base) == sizeof(is_empty_helper<T>)
+  >
+  {
+  };
+
+template <typename Invokable, typename... Args>
+using invoke_result_t =
+#if THRUST_CPP_DIALECT < 2017
+  typename ::cuda::std::result_of<Invokable(Args...)>::type;
+#else // 2017+
+  ::cuda::std::invoke_result_t<Invokable, Args...>;
+#endif
+
+template <class F, class... Us> 
+struct invoke_result
+{
+  using type = invoke_result_t<F, Us...>;
+};
+
 } // end detail
 
-} // end thrust
+using detail::integral_constant;
+using detail::true_type;
+using detail::false_type;
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/type_traits/has_trivial_assign.h>
 
diff --git a/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h b/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h
deleted file mode 100644
index 73d50a86e..000000000
--- a/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/function_traits.h>
-#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
-
-namespace thrust
-{
-
-namespace detail
-{
-
-// this trait reports what type should be used as a temporary in certain algorithms
-// which aggregate intermediate results from a function before writing to an output iterator
-
-// the pseudocode for deducing the type of the temporary used below:
-// 
-// if Function is an AdaptableFunction
-//   result = Function::result_type
-// else if OutputIterator2 is a "pure" output iterator
-//   result = InputIterator2::value_type
-// else
-//   result = OutputIterator2::value_type
-//
-// XXX upon c++0x, TemporaryType needs to be:
-// result_of<BinaryFunction>::type
-template<typename InputIterator, typename OutputIterator, typename Function>
-  struct intermediate_type_from_function_and_iterators
-    : eval_if<
-        has_result_type<Function>::value,
-        result_type<Function>,
-        eval_if<
-          is_output_iterator<OutputIterator>::value,
-          thrust::iterator_value<InputIterator>,
-          thrust::iterator_value<OutputIterator>
-        >
-      >
-{
-}; // end intermediate_type_from_function_and_iterators
-
-} // end detail
-
-} // end thrust
-
diff --git a/thrust/detail/type_traits/function_traits.h b/thrust/detail/type_traits/function_traits.h
index 0c7775c0d..109820136 100644
--- a/thrust/detail/type_traits/function_traits.h
+++ b/thrust/detail/type_traits/function_traits.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/has_nested_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward definitions for is_commutative
 template <typename T> struct plus;
@@ -92,5 +93,5 @@ template<typename T> struct is_commutative< typename thrust::bit_and<T>     > :
 template<typename T> struct is_commutative< typename thrust::bit_xor<T>     > : public thrust::detail::is_arithmetic<T> {};
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/has_member_function.h b/thrust/detail/type_traits/has_member_function.h
index 03ed61b6d..c33fe28f6 100644
--- a/thrust/detail/type_traits/has_member_function.h
+++ b/thrust/detail/type_traits/has_member_function.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -18,101 +18,21 @@
 
 #include <thrust/detail/type_traits.h>
 
-#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name)                                \
-template<typename T, typename Signature> class trait_name;                                                   \
-                                                                                                             \
-template<typename T, typename Result>                                                                        \
-class trait_name<T, Result(void)>                                                                            \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name();                                                                          \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(), &U::member_function_name>* = 0);                    \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg>                                                          \
-class trait_name<T, Result(Arg)>                                                                             \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg);                                                                       \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg), &U::member_function_name>* = 0);                 \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2>                                          \
-class trait_name<T, Result(Arg1,Arg2)>                                                                       \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2);                                                                 \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2), &U::member_function_name>* = 0);           \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3>                           \
-class trait_name<T, Result(Arg1,Arg2,Arg3)>                                                                  \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2,Arg3);                                                            \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3), &U::member_function_name>* = 0);      \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           \
-                                                                                                             \
-template<typename T, typename Result, typename Arg1, typename Arg2, typename Arg3, typename Arg4>            \
-class trait_name<T, Result(Arg1,Arg2,Arg3,Arg4)>                                                             \
-{                                                                                                            \
-   class yes { char m; };                                                                                    \
-   class no { yes m[2]; };                                                                                   \
-   struct base_mixin                                                                                         \
-   {                                                                                                         \
-     Result member_function_name(Arg1,Arg2,Arg3,Arg4);                                                       \
-   };                                                                                                        \
-   struct base : public T, public base_mixin {};                                                             \
-   template <typename U, U t>  class helper{};                                                               \
-   template <typename U>                                                                                     \
-   static no deduce(U*, helper<Result (base_mixin::*)(Arg1,Arg2,Arg3,Arg4), &U::member_function_name>* = 0); \
-   static yes deduce(...);                                                                                   \
-public:                                                                                                      \
-   static const bool value = sizeof(yes) == sizeof(deduce(static_cast<base*>(0)));                           \
-   typedef thrust::detail::integral_constant<bool,value> type;                                               \
-};                                                                                                           
+#include <utility> // for std::declval
 
+#define __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name, member_function_name)  \
+  template <typename T, typename Signature, typename = void>                   \
+  struct trait_name : thrust::false_type                                       \
+  {};                                                                          \
+                                                                               \
+  template <typename T, typename ResultT, typename... Args>                    \
+  struct trait_name<T,                                                         \
+                    ResultT(Args...),                                          \
+                    typename thrust::detail::enable_if<                        \
+                      thrust::detail::is_same<ResultT, void>::value ||         \
+                      thrust::detail::is_convertible<                          \
+                        ResultT,                                               \
+                        decltype(std::declval<T>().member_function_name(       \
+                          std::declval<Args>()...))>::value>::type>            \
+      : thrust::true_type                                                      \
+  {};
diff --git a/thrust/detail/type_traits/has_trivial_assign.h b/thrust/detail/type_traits/has_trivial_assign.h
index 15496560d..7222ce593 100644
--- a/thrust/detail/type_traits/has_trivial_assign.h
+++ b/thrust/detail/type_traits/has_trivial_assign.h
@@ -25,28 +25,23 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+#include <cuda/std/type_traits>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
-template<typename T> struct has_trivial_assign
+template<typename T> 
+struct has_trivial_assign
   : public integral_constant<
-      bool,
-      (is_pod<T>::value && !is_const<T>::value)
-#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-      || __has_trivial_assign(T)
-#elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC
-// only use the intrinsic for >= 4.3
-#if (__GNUC__ >= 4) && (__GNUC_MINOR__ >= 3)
-      || __has_trivial_assign(T)
-#endif // GCC VERSION
-#endif // THRUST_HOST_COMPILER
+      bool, 
+      (is_pod<T>::value && !is_const<T>::value) 
+      || ::cuda::std::is_trivially_copy_assignable<T>::value
     >
 {};
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/is_call_possible.h b/thrust/detail/type_traits/is_call_possible.h
index bff049377..58c1aca4d 100644
--- a/thrust/detail/type_traits/is_call_possible.h
+++ b/thrust/detail/type_traits/is_call_possible.h
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/has_member_function.h>
 
 // inspired by Roman Perepelitsa's presentation from comp.lang.c++.moderated
 // based on the implementation here: http://www.rsdn.ru/forum/cpp/2759773.1.aspx
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace is_call_possible_detail
@@ -51,7 +52,7 @@ struct clone_constness<const src_type, dest_type>
 
 } // end is_call_possible_detail
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
 #define __THRUST_DEFINE_IS_CALL_POSSIBLE(trait_name, member_function_name)                                                                \
 __THRUST_DEFINE_HAS_MEMBER_FUNCTION(trait_name##_has_member, member_function_name)                                                        \
diff --git a/thrust/detail/type_traits/is_metafunction_defined.h b/thrust/detail/type_traits/is_metafunction_defined.h
index c278e5bdb..2c7a4be52 100644
--- a/thrust/detail/type_traits/is_metafunction_defined.h
+++ b/thrust/detail/type_traits/is_metafunction_defined.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -37,5 +38,5 @@ template<typename Metafunction>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/iterator/is_discard_iterator.h b/thrust/detail/type_traits/iterator/is_discard_iterator.h
index 0a5900de2..210409d62 100644
--- a/thrust/detail/type_traits/iterator/is_discard_iterator.h
+++ b/thrust/detail/type_traits/iterator/is_discard_iterator.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/discard_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -36,5 +35,5 @@ struct is_discard_iterator< thrust::discard_iterator<System> >
 {};
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/iterator/is_output_iterator.h b/thrust/detail/type_traits/iterator/is_output_iterator.h
index d6801305b..555b67400 100644
--- a/thrust/detail/type_traits/iterator/is_output_iterator.h
+++ b/thrust/detail/type_traits/iterator/is_output_iterator.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/detail/any_assign.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -62,5 +61,5 @@ template<typename T>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/minimum_type.h b/thrust/detail/type_traits/minimum_type.h
index 7e34f4f8a..2417e327d 100644
--- a/thrust/detail/type_traits/minimum_type.h
+++ b/thrust/detail/type_traits/minimum_type.h
@@ -16,10 +16,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 { 
@@ -56,8 +57,8 @@ struct primitive_minimum_type
   : minimum_type_detail::minimum_type_impl<
       T1,
       T2,
-      ::thrust::detail::is_convertible<T1,T2>::value,
-      ::thrust::detail::is_convertible<T2,T1>::value
+      THRUST_NS_QUALIFIER::detail::is_convertible<T1,T2>::value,
+      THRUST_NS_QUALIFIER::detail::is_convertible<T2,T1>::value
     >
 {
 }; // end primitive_minimum_type
@@ -158,5 +159,5 @@ template<typename T1,  typename T2,  typename T3,  typename T4,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/pointer_traits.h b/thrust/detail/type_traits/pointer_traits.h
index 9efd2464d..90a8bc29d 100644
--- a/thrust/detail/type_traits/pointer_traits.h
+++ b/thrust/detail/type_traits/pointer_traits.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,9 +22,9 @@
 #include <thrust/detail/type_traits/has_nested_type.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <cstddef>
+#include <type_traits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -83,34 +83,58 @@ template<typename Ptr, typename T> struct rebind_pointer;
 template<typename T, typename U>
   struct rebind_pointer<T*,U>
 {
-  typedef U* type;
+  using type = U*;
 };
 
-template<template<typename> class Ptr, typename Arg, typename T>
-  struct rebind_pointer<Ptr<Arg>,T>
+// Rebind generic fancy pointers.
+template<template<typename, typename...> class Ptr, typename OldT, typename... Tail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tail...>,T>
 {
-  typedef Ptr<T> type;
+  using type = Ptr<T,Tail...>;
 };
 
-template<template<typename, typename> class Ptr, typename Arg1, typename Arg2, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2>,T>
+// Rebind `thrust::pointer`-like things with `thrust::reference`-like references.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class Ref, typename... RefTail,
+         typename... PtrTail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,Ref<OldT,RefTail...>,PtrTail...>,T>
 {
-  typedef Ptr<T,Arg2> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "0");
+  using type = Ptr<T,Tag,Ref<T,RefTail...>,PtrTail...>;
 };
 
-template<template<typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3>,T>
+// Rebind `thrust::pointer`-like things with `thrust::reference`-like references
+// and templated derived types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class Ref, typename... RefTail,
+         template<typename...> class DerivedPtr, typename... DerivedPtrTail,
+         typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,Ref<OldT,RefTail...>,DerivedPtr<OldT,DerivedPtrTail...>>,T>
 {
-  typedef Ptr<T,Arg2,Arg3> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "1");
+  using type = Ptr<T,Tag,Ref<T,RefTail...>,DerivedPtr<T,DerivedPtrTail...>>;
 };
 
-template<template<typename, typename, typename, typename> class Ptr, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename T>
-  struct rebind_pointer<Ptr<Arg1,Arg2,Arg3,Arg4>,T>
+// Rebind `thrust::pointer`-like things with native reference types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         typename... PtrTail, typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,typename std::add_lvalue_reference<OldT>::type,PtrTail...>,T>
 {
-  typedef Ptr<T,Arg2,Arg3,Arg4> type;
+//  static_assert(std::is_same<OldT, Tag>::value, "2");
+  using type = Ptr<T,Tag,typename std::add_lvalue_reference<T>::type,PtrTail...>;
+};
+
+// Rebind `thrust::pointer`-like things with native reference types and templated
+// derived types.
+template<template<typename, typename, typename, typename...> class Ptr, typename OldT, typename Tag,
+         template<typename...> class DerivedPtr, typename... DerivedPtrTail,
+         typename T>
+  struct rebind_pointer<Ptr<OldT,Tag,typename std::add_lvalue_reference<OldT>::type,DerivedPtr<OldT,DerivedPtrTail...>>,T>
+{
+//  static_assert(std::is_same<OldT, Tag>::value, "3");
+  using type = Ptr<T,Tag,typename std::add_lvalue_reference<T>::type,DerivedPtr<T,DerivedPtrTail...>>;
 };
 
-// XXX this should probably be renamed native_type or similar
 __THRUST_DEFINE_HAS_NESTED_TYPE(has_raw_pointer, raw_pointer)
 
 namespace pointer_traits_detail
@@ -174,11 +198,12 @@ template<typename Ptr>
   struct pointer_traits
 {
   typedef Ptr                                    pointer;
+  typedef typename Ptr::reference                reference;
   typedef typename pointer_element<Ptr>::type    element_type;
   typedef typename pointer_difference<Ptr>::type difference_type;
 
   template<typename U>
-    struct rebind 
+    struct rebind
   {
     typedef typename rebind_pointer<Ptr,U>::type other;
   };
@@ -188,7 +213,7 @@ template<typename Ptr>
   {
     // XXX this is supposed to be pointer::pointer_to(&r); (i.e., call a static member function of pointer called pointer_to)
     //     assume that pointer has a constructor from raw pointer instead
-    
+
     return pointer(&r);
   }
 
@@ -206,6 +231,7 @@ template<typename T>
   struct pointer_traits<T*>
 {
   typedef T*                                    pointer;
+  typedef T&                                    reference;
   typedef T                                     element_type;
   typedef typename pointer_difference<T*>::type difference_type;
 
@@ -231,6 +257,74 @@ template<typename T>
   }
 };
 
+template<>
+  struct pointer_traits<void*>
+{
+  typedef void*                                    pointer;
+  typedef void                                     reference;
+  typedef void                                     element_type;
+  typedef pointer_difference<void*>::type          difference_type;
+
+  template<typename U>
+    struct rebind
+  {
+    typedef U* other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    return &r;
+  }
+
+  // thrust additions follow
+  typedef pointer_raw_pointer<void*>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr;
+  }
+};
+
+template<>
+  struct pointer_traits<const void*>
+{
+  typedef const void*                           pointer;
+  typedef const void                            reference;
+  typedef const void                            element_type;
+  typedef pointer_difference<const void*>::type difference_type;
+
+  template<typename U>
+    struct rebind
+  {
+    typedef U* other;
+  };
+
+  __host__ __device__
+  inline static pointer pointer_to(pointer_traits_detail::pointer_to_param<element_type>::type r)
+  {
+    return &r;
+  }
+
+  // thrust additions follow
+  typedef pointer_raw_pointer<const void*>::type raw_pointer;
+
+  __host__ __device__
+  inline static raw_pointer get(pointer ptr)
+  {
+    return ptr;
+  }
+};
+
+template<typename FromPtr, typename ToPtr>
+  struct is_pointer_system_convertible
+    : thrust::detail::is_convertible<
+        typename iterator_system<FromPtr>::type,
+        typename iterator_system<ToPtr>::type
+      >
+{};
+
 template<typename FromPtr, typename ToPtr>
   struct is_pointer_convertible
     : thrust::detail::and_<
@@ -238,10 +332,18 @@ template<typename FromPtr, typename ToPtr>
           typename pointer_element<FromPtr>::type *,
           typename pointer_element<ToPtr>::type *
         >,
-        thrust::detail::is_convertible<
-          typename iterator_system<FromPtr>::type,
-          typename iterator_system<ToPtr>::type
-        >
+        is_pointer_system_convertible<FromPtr, ToPtr>
+      >
+{};
+
+template<typename FromPtr, typename ToPtr>
+  struct is_void_pointer_system_convertible
+    : thrust::detail::and_<
+        thrust::detail::is_same<
+          typename pointer_element<FromPtr>::type,
+          void
+        >,
+        is_pointer_system_convertible<FromPtr, ToPtr>
       >
 {};
 
@@ -262,6 +364,15 @@ template<typename FromPtr, typename ToPtr>
       >
 {};
 
+template<typename FromPtr, typename ToPtr>
+  struct lazy_is_void_pointer_system_convertible
+    : thrust::detail::eval_if<
+        is_thrust_pointer<FromPtr>::value && is_thrust_pointer<ToPtr>::value,
+        is_void_pointer_system_convertible<FromPtr,ToPtr>,
+        thrust::detail::identity_<thrust::detail::false_type>
+      >
+{};
+
 template<typename FromPtr, typename ToPtr, typename T = void>
   struct enable_if_pointer_is_convertible
     : thrust::detail::enable_if<
@@ -270,7 +381,15 @@ template<typename FromPtr, typename ToPtr, typename T = void>
       >
 {};
 
+template<typename FromPtr, typename ToPtr, typename T = void>
+  struct enable_if_void_pointer_is_system_convertible
+    : thrust::detail::enable_if<
+        lazy_is_void_pointer_system_convertible<FromPtr,ToPtr>::type::value,
+        T
+      >
+{};
+
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/type_traits/result_of.h b/thrust/detail/type_traits/result_of.h
deleted file mode 100644
index 8177aec73..000000000
--- a/thrust/detail/type_traits/result_of.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/function_traits.h>
-
-namespace thrust
-{
-namespace detail
-{
-
-template<typename Signature, typename Enable = void> struct result_of;
-
-// specialization for unary invocations of things which have result_type
-template<typename Functor, typename Arg1>
-  struct result_of<
-    Functor(Arg1),
-    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
-  >
-{
-  typedef typename Functor::result_type type;
-}; // end result_of
-
-// specialization for binary invocations of things which have result_type
-template<typename Functor, typename Arg1, typename Arg2>
-  struct result_of<
-    Functor(Arg1,Arg2),
-    typename thrust::detail::enable_if<thrust::detail::has_result_type<Functor>::value>::type
-  >
-{
-  typedef typename Functor::result_type type;
-};
-
-} // end detail
-} // end thrust
-
diff --git a/thrust/detail/type_traits/result_of_adaptable_function.h b/thrust/detail/type_traits/result_of_adaptable_function.h
new file mode 100644
index 000000000..edf797f14
--- /dev/null
+++ b/thrust/detail/type_traits/result_of_adaptable_function.h
@@ -0,0 +1,59 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/function_traits.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+namespace detail
+{
+
+// Sets `type` to the result of the specified Signature invocation. If the
+// callable defines a `result_type` alias member, that type is used instead.
+// Use invoke_result / result_of when FuncType::result_type is not defined.
+template <typename Signature, typename Enable = void>
+struct result_of_adaptable_function
+{
+private:
+  template <typename Sig> struct impl;
+
+  template <typename F, typename... Args>
+  struct impl<F(Args...)>
+  {
+    using type = invoke_result_t<F, Args...>;
+  };
+
+public:
+  using type = typename impl<Signature>::type;
+};
+
+// specialization for invocations which define result_type
+template <typename Functor, typename... ArgTypes>
+struct result_of_adaptable_function<
+  Functor(ArgTypes...),
+  typename thrust::detail::enable_if<
+    thrust::detail::has_result_type<Functor>::value>::type>
+{
+  using type = typename Functor::result_type;
+};
+
+} // namespace detail
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/uninitialized_copy.inl b/thrust/detail/uninitialized_copy.inl
index 660df76d5..2778693ad 100644
--- a/thrust/detail/uninitialized_copy.inl
+++ b/thrust/detail/uninitialized_copy.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file uninitialized_copy.inl
- *  \brief Inline file for uninitialized_copy.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/uninitialized_copy.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/uninitialized_copy.h>
 #include <thrust/system/detail/adl/uninitialized_copy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -92,6 +90,6 @@ template<typename InputIterator,
 } // end uninitialized_copy_n()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/detail/uninitialized_fill.inl b/thrust/detail/uninitialized_fill.inl
index 30eab23a2..e013dac7b 100644
--- a/thrust/detail/uninitialized_fill.inl
+++ b/thrust/detail/uninitialized_fill.inl
@@ -14,10 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file uninitialized_fill.inl
- *  \brief Inline file for uninitialized_fill.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/uninitialized_fill.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -25,8 +24,7 @@
 #include <thrust/system/detail/generic/uninitialized_fill.h>
 #include <thrust/system/detail/adl/uninitialized_fill.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -88,5 +86,5 @@ template<typename ForwardIterator,
 } // end uninitialized_fill_n()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/unique.inl b/thrust/detail/unique.inl
index b6fa9304d..ac5475f02 100644
--- a/thrust/detail/unique.inl
+++ b/thrust/detail/unique.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file unique.inl
- *  \brief Inline file for unique.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/unique.h>
@@ -28,8 +25,7 @@
 #include <thrust/system/detail/adl/unique.h>
 #include <thrust/system/detail/adl/unique_by_key.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 __thrust_exec_check_disable__
@@ -99,7 +95,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<ForwardIterator1,ForwardIterator2>
   unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_first,
                 ForwardIterator1 keys_last,
                 ForwardIterator2 values_first)
 {
@@ -116,7 +112,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<ForwardIterator1,ForwardIterator2>
   unique_by_key(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                ForwardIterator1 keys_first, 
+                ForwardIterator1 keys_first,
                 ForwardIterator1 keys_last,
                 ForwardIterator2 values_first,
                 BinaryPredicate binary_pred)
@@ -135,7 +131,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first, 
+                     InputIterator1 keys_first,
                      InputIterator1 keys_last,
                      InputIterator2 values_first,
                      OutputIterator1 keys_output,
@@ -156,7 +152,7 @@ template<typename DerivedPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
   unique_by_key_copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
-                     InputIterator1 keys_first, 
+                     InputIterator1 keys_first,
                      InputIterator1 keys_last,
                      InputIterator2 values_first,
                      OutputIterator1 keys_output,
@@ -239,7 +235,7 @@ template<typename InputIterator,
 template<typename ForwardIterator1,
          typename ForwardIterator2>
   thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(ForwardIterator1 keys_first, 
+    unique_by_key(ForwardIterator1 keys_first,
                   ForwardIterator1 keys_last,
                   ForwardIterator2 values_first)
 {
@@ -259,7 +255,7 @@ template<typename ForwardIterator1,
          typename ForwardIterator2,
          typename BinaryPredicate>
   thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(ForwardIterator1 keys_first, 
+    unique_by_key(ForwardIterator1 keys_first,
                   ForwardIterator1 keys_last,
                   ForwardIterator2 values_first,
                   BinaryPredicate binary_pred)
@@ -281,7 +277,7 @@ template<typename InputIterator1,
          typename OutputIterator1,
          typename OutputIterator2>
   thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(InputIterator1 keys_first, 
+    unique_by_key_copy(InputIterator1 keys_first,
                        InputIterator1 keys_last,
                        InputIterator2 values_first,
                        OutputIterator1 keys_output,
@@ -309,7 +305,7 @@ template<typename InputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate>
   thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(InputIterator1 keys_first, 
+    unique_by_key_copy(InputIterator1 keys_first,
                        InputIterator1 keys_last,
                        InputIterator2 values_first,
                        OutputIterator1 keys_output,
@@ -331,6 +327,67 @@ template<typename InputIterator1,
   return thrust::unique_by_key_copy(select_system(system1,system2,system3,system4), keys_first, keys_last, values_first, keys_output, values_output, binary_pred);
 } // end unique_by_key_copy()
 
+__thrust_exec_check_disable__
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::unique_count;
+  return unique_count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last, binary_pred);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename DerivedPolicy,
+          typename ForwardIterator>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::unique_count;
+  return unique_count(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), first, last);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique_count(select_system(system), first, last, binary_pred);
+} // end unique_count()
+
+__thrust_exec_check_disable__
+template <typename ForwardIterator>
+__host__ __device__
+    typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last)
+{
+  using thrust::system::detail::generic::select_system;
+
+  typedef typename thrust::iterator_system<ForwardIterator>::type System;
+
+  System system;
+
+  return thrust::unique_count(select_system(system), first, last);
+} // end unique_count()
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/use_default.h b/thrust/detail/use_default.h
index ba2c27bc5..f25b6274c 100644
--- a/thrust/detail/use_default.h
+++ b/thrust/detail/use_default.h
@@ -18,10 +18,9 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 struct use_default {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/util/align.h b/thrust/detail/util/align.h
index af97cd44a..a3aa75bfe 100644
--- a/thrust/detail/util/align.h
+++ b/thrust/detail/util/align.h
@@ -17,12 +17,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/cstdint.h>
 
 // functions to handle memory alignment
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 namespace util
@@ -55,5 +56,5 @@ bool is_aligned(T * ptr, detail::uintptr_t bytes = sizeof(T))
 
 } // end namespace util
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/detail/util/blocking.h b/thrust/detail/util/blocking.h
deleted file mode 100644
index 7aedad9c5..000000000
--- a/thrust/detail/util/blocking.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#pragma once
-
-//functions to support blocking
-
-namespace thrust
-{
-
-namespace detail
-{
-
-namespace util
-{
-
-// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_ri(const L x, const R y)
-{
-    return (x + (y - 1)) / y;
-}
-
-// x/y rounding towards zero for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_rz(const L x, const R y)
-{
-    return x / y;
-}
-
-// round x towards infinity to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }
-
-// round x towards zero to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); }
-
-} // end namespace util
-
-} // end namespace detail
-
-} // end namespace thrust
-
diff --git a/thrust/detail/vector_base.h b/thrust/detail/vector_base.h
index 37ea3223d..0c4da449e 100644
--- a/thrust/detail/vector_base.h
+++ b/thrust/detail/vector_base.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,11 +26,13 @@
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/detail/config.h>
 #include <thrust/detail/contiguous_storage.h>
+
+#include <initializer_list>
 #include <vector>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -62,12 +64,24 @@ template<typename T, typename Alloc>
      */
     vector_base(void);
 
+    /*! This constructor creates an empty vector_base.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(const Alloc &alloc);
+
     /*! This constructor creates a vector_base with default-constructed
      *  elements.
      *  \param n The number of elements to create.
      */
     explicit vector_base(size_type n);
 
+    /*! This constructor creates a vector_base with default-constructed
+     *  elements.
+     *  \param n The number of elements to create.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(size_type n, const Alloc &alloc);
+
     /*! This constructor creates a vector_base with copies
      *  of an exemplar element.
      *  \param n The number of elements to initially create.
@@ -75,16 +89,63 @@ template<typename T, typename Alloc>
      */
     explicit vector_base(size_type n, const value_type &value);
 
+    /*! This constructor creates a vector_base with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    explicit vector_base(size_type n, const value_type &value, const Alloc &alloc);
+
     /*! Copy constructor copies from an exemplar vector_base.
      *  \param v The vector_base to copy.
      */
     vector_base(const vector_base &v);
 
-    /*! assign operator makes a copy of an exemplar vector_base.
+    /*! Copy constructor copies from an exemplar vector_base.
+     *  \param v The vector_base to copy.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    vector_base(const vector_base &v, const Alloc &alloc);
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another vector_base.
+     *  \param v The vector_base to move.
+     */
+    vector_base(vector_base &&v);
+
+    // FIXME: the internal Thrust machinery in range_init doesn't work with move
+    // iterators, which is necessary for the following constructor to be implemented
+    // correctly
+    // vector_base(vector_base &&v, const Alloc &alloc);
+  #endif
+
+    /*! Copy assign operator copies from another vector_base.
      *  \param v The vector_base to copy.
      */
     vector_base &operator=(const vector_base &v);
 
+    /*! Move assign operator moves from another vector_base.
+     *  \param v The vector_base to move.
+     */
+    vector_base &operator=(vector_base &&v);
+
+    /*! This constructor builds a \p vector_base from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    vector_base(std::initializer_list<T> il);
+      
+    /*! This constructor builds a \p vector_base from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    vector_base(std::initializer_list<T> il, const Alloc &alloc);
+    
+    /*! Assign operator copies from an initializer_list
+     *  \param il The initializer_list.
+     */
+    vector_base &operator=(std::initializer_list<T> il);
+
     /*! Copy constructor copies from an exemplar vector_base with different
      *  type.
      *  \param v The vector_base to copy.
@@ -124,6 +185,14 @@ template<typename T, typename Alloc>
     template<typename InputIterator>
     vector_base(InputIterator first, InputIterator last);
 
+    /*! This constructor builds a vector_base from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this vector_base.
+     */
+    template<typename InputIterator>
+    vector_base(InputIterator first, InputIterator last, const Alloc &alloc);
+
     /*! The destructor erases the elements.
      */
     ~vector_base(void);
@@ -153,11 +222,13 @@ template<typename T, typename Alloc>
 
     /*! Returns the number of elements in this vector_base.
      */
+    __host__ __device__
     size_type size(void) const;
 
     /*! Returns the size() of the largest possible vector_base.
      *  \return The largest possible return value of size().
      */
+    __host__ __device__
     size_type max_size(void) const;
 
     /*! \brief If n is less than or equal to capacity(), this call has no effect.
@@ -171,6 +242,7 @@ template<typename T, typename Alloc>
     /*! Returns the number of elements which have been reserved in this
      *  vector_base.
      */
+    __host__ __device__
     size_type capacity(void) const;
 
     /*! This method shrinks the capacity of this vector_base to exactly
@@ -186,6 +258,7 @@ template<typename T, typename Alloc>
      *  Note that data access with this operator is unchecked and
      *  out_of_range lookups are not defined.
      */
+    __host__ __device__
     reference operator[](size_type n);
 
     /*! \brief Subscript read access to the data contained in this vector_dev.
@@ -196,24 +269,28 @@ template<typename T, typename Alloc>
      *  Note that data access with this operator is unchecked and
      *  out_of_range lookups are not defined.
      */
+    __host__ __device__
     const_reference operator[](size_type n) const;
 
     /*! This method returns an iterator pointing to the beginning of
      *  this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     iterator begin(void);
 
     /*! This method returns a const_iterator pointing to the beginning
      *  of this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     const_iterator begin(void) const;
 
     /*! This method returns a const_iterator pointing to the beginning
      *  of this vector_base.
      *  \return mStart
      */
+    __host__ __device__
     const_iterator cbegin(void) const;
 
     /*! This method returns a reverse_iterator pointing to the beginning of
@@ -221,6 +298,7 @@ template<typename T, typename Alloc>
      *  \return A reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     reverse_iterator rbegin(void);
 
     /*! This method returns a const_reverse_iterator pointing to the beginning of
@@ -228,6 +306,7 @@ template<typename T, typename Alloc>
      *  \return A const_reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     const_reverse_iterator rbegin(void) const;
 
     /*! This method returns a const_reverse_iterator pointing to the beginning of
@@ -235,76 +314,89 @@ template<typename T, typename Alloc>
      *  \return A const_reverse_iterator pointing to the beginning of this
      *          vector_base's reversed sequence.
      */
+    __host__ __device__
     const_reverse_iterator crbegin(void) const;
 
     /*! This method returns an iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     iterator end(void);
 
     /*! This method returns a const_iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     const_iterator end(void) const;
 
     /*! This method returns a const_iterator pointing to one element past the
      *  last of this vector_base.
      *  \return begin() + size().
      */
+    __host__ __device__
     const_iterator cend(void) const;
 
     /*! This method returns a reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     reverse_iterator rend(void);
 
     /*! This method returns a const_reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     const_reverse_iterator rend(void) const;
 
     /*! This method returns a const_reverse_iterator pointing to one element past the
      *  last of this vector_base's reversed sequence.
      *  \return rbegin() + size().
      */
+    __host__ __device__
     const_reverse_iterator crend(void) const;
 
     /*! This method returns a const_reference referring to the first element of this
      *  vector_base.
      *  \return The first element of this vector_base.
      */
+    __host__ __device__
     const_reference front(void) const;
 
     /*! This method returns a reference pointing to the first element of this
      *  vector_base.
      *  \return The first element of this vector_base.
      */
+    __host__ __device__
     reference front(void);
 
     /*! This method returns a const reference pointing to the last element of
      *  this vector_base.
      *  \return The last element of this vector_base.
      */
+    __host__ __device__
     const_reference back(void) const;
 
     /*! This method returns a reference referring to the last element of
      *  this vector_dev.
      *  \return The last element of this vector_base.
      */
+    __host__ __device__
     reference back(void);
 
     /*! This method returns a pointer to this vector_base's first element.
      *  \return A pointer to the first element of this vector_base.
      */
+    __host__ __device__
     pointer data(void);
 
     /*! This method returns a const_pointer to this vector_base's first element.
      *  \return a const_pointer to the first element of this vector_base.
      */
+    __host__ __device__
     const_pointer data(void) const;
 
     /*! This method resizes this vector_base to 0.
@@ -314,6 +406,7 @@ template<typename T, typename Alloc>
     /*! This method returns true iff size() == 0.
      *  \return true if size() == 0; false, otherwise.
      */
+    __host__ __device__
     bool empty(void) const;
 
     /*! This method appends the given element to the end of this vector_base.
@@ -368,8 +461,8 @@ template<typename T, typename Alloc>
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -385,7 +478,7 @@ template<typename T, typename Alloc>
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -528,7 +621,7 @@ template<typename T1, typename Alloc1,
 bool operator!=(const std::vector<T1,Alloc1>&         lhs,
                 const detail::vector_base<T2,Alloc2>& rhs);
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/vector_base.inl>
 
diff --git a/thrust/detail/vector_base.inl b/thrust/detail/vector_base.inl
index f985e90f2..bdd6c1c7a 100644
--- a/thrust/detail/vector_base.inl
+++ b/thrust/detail/vector_base.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,11 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file vector_base.inl
- *  \brief Inline file for vector_base.h.
- */
-
+#include <thrust/detail/config.h>
 #include <thrust/detail/vector_base.h>
 #include <thrust/detail/copy.h>
 #include <thrust/detail/overlapped_copy.h>
@@ -32,8 +30,7 @@
 
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -47,6 +44,15 @@ template<typename T, typename Alloc>
   ;
 } // end vector_base::vector_base()
 
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  ;
+} // end vector_base::vector_base()
+
 template<typename T, typename Alloc>
   vector_base<T,Alloc>
     ::vector_base(size_type n)
@@ -56,6 +62,15 @@ template<typename T, typename Alloc>
   default_init(n);
 } // end vector_base::vector_base()
 
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  default_init(n);
+} // end vector_base::vector_base()
+
 template<typename T, typename Alloc>
   vector_base<T,Alloc>
     ::vector_base(size_type n, const value_type &value)
@@ -65,15 +80,44 @@ template<typename T, typename Alloc>
   fill_init(n,value);
 } // end vector_base::vector_base()
 
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(size_type n, const value_type &value, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  fill_init(n,value);
+} // end vector_base::vector_base()
+
 template<typename T, typename Alloc>
   vector_base<T,Alloc>
     ::vector_base(const vector_base &v)
-      :m_storage(),
+      :m_storage(copy_allocator_t(), v.m_storage),
        m_size(0)
 {
   range_init(v.begin(), v.end());
 } // end vector_base::vector_base()
 
+template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(const vector_base &v, const Alloc &alloc)
+      :m_storage(alloc),
+       m_size(0)
+{
+  range_init(v.begin(), v.end());
+} // end vector_base::vector_base()
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc>
+      ::vector_base(vector_base &&v)
+        :m_storage(copy_allocator_t(), v.m_storage),
+         m_size(0)
+  {
+    *this = std::move(v);
+  } //end vector_base::vector_base()
+#endif
+
 template<typename T, typename Alloc>
   vector_base<T,Alloc> &
     vector_base<T,Alloc>
@@ -81,12 +125,34 @@ template<typename T, typename Alloc>
 {
   if(this != &v)
   {
+    m_storage.destroy_on_allocator_mismatch(v.m_storage, begin(), end());
+    m_storage.deallocate_on_allocator_mismatch(v.m_storage);
+
+    m_storage.propagate_allocator(v.m_storage);
+
     assign(v.begin(), v.end());
   } // end if
 
   return *this;
 } // end vector_base::operator=()
 
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+        ::operator=(vector_base &&v)
+  {
+    m_storage.destroy(begin(), end());
+    m_storage = std::move(v.m_storage);
+    m_size = std::move(v.m_size);
+
+    v.m_storage = contiguous_storage<T,Alloc>(copy_allocator_t(), m_storage);
+    v.m_size = 0;
+
+    return *this;
+  } // end vector_base::operator=()
+#endif
+
 template<typename T, typename Alloc>
   template<typename OtherT, typename OtherAlloc>
     vector_base<T,Alloc>
@@ -129,6 +195,34 @@ template<typename T, typename Alloc>
   return *this;
 } // end vector_base::operator=()
 
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc>
+      ::vector_base(std::initializer_list<T> il)
+        :m_storage(),
+         m_size(0)
+  {
+    range_init(il.begin(), il.end());
+  } // end vector_base::vector_base()
+
+  template<typename T, typename Alloc>
+  vector_base<T,Alloc>
+    ::vector_base(std::initializer_list<T> il, const Alloc &alloc)
+    :m_storage(alloc),
+      m_size(0)
+  {
+    range_init(il.begin(), il.end());
+  } // end vector_base::vector_base()
+
+  template<typename T, typename Alloc>
+    vector_base<T,Alloc> &
+      vector_base<T,Alloc>
+      ::operator=(std::initializer_list<T> il)
+  {
+    assign(il.begin(), il.end());
+
+    return *this;
+  } // end vector_base::operator=()
+
 template<typename T, typename Alloc>
   template<typename IteratorOrIntegralType>
     void vector_base<T,Alloc>
@@ -222,7 +316,23 @@ template<typename T, typename Alloc>
   typedef thrust::detail::is_integral<InputIterator> Integer;
 
   init_dispatch(first, last, Integer());
-} // end vector_basee::vector_base()
+} // end vector_base::vector_base()
+
+template<typename T, typename Alloc>
+  template<typename InputIterator>
+    vector_base<T,Alloc>
+      ::vector_base(InputIterator first,
+                    InputIterator last,
+                    const Alloc &alloc)
+        :m_storage(alloc),
+         m_size(0)
+{
+  // check the type of InputIterator: if it's an integral type,
+  // we need to interpret this call as (size_type, value_type)
+  typedef thrust::detail::is_integral<InputIterator> Integer;
+
+  init_dispatch(first, last, Integer());
+} // end vector_base::vector_base()
 
 template<typename T, typename Alloc>
   void vector_base<T,Alloc>
@@ -257,6 +367,7 @@ template<typename T, typename Alloc>
 } // end vector_base::resize()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::size(void) const
@@ -265,6 +376,7 @@ template<typename T, typename Alloc>
 } // end vector_base::size()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::max_size(void) const
@@ -278,11 +390,43 @@ template<typename T, typename Alloc>
 {
   if(n > capacity())
   {
-    allocate_and_copy(n, begin(), end(), m_storage);
+    // compute the new capacity after the allocation
+    size_type new_capacity = n;
+
+    // do not exceed maximum storage
+    new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
+
+    // create new storage
+    storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
+
+    // record how many constructors we invoke in the try block below
+    iterator new_end = new_storage.begin();
+
+    try
+    {
+      // construct copy all elements into the newly allocated storage
+      new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin());
+    } // end try
+    catch(...)
+    {
+      // something went wrong, so destroy & deallocate the new storage
+      new_storage.destroy(new_storage.begin(), new_end);
+      new_storage.deallocate();
+
+      // rethrow
+      throw;
+    } // end catch
+
+    // call destructors on the elements in the old storage
+    m_storage.destroy(begin(), end());
+
+    // record the vector's new state
+    m_storage.swap(new_storage);
   } // end if
 } // end vector_base::reserve()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::size_type
     vector_base<T,Alloc>
       ::capacity(void) const
@@ -299,6 +443,7 @@ template<typename T, typename Alloc>
 } // end vector_base::shrink_to_fit()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::operator[](const size_type n)
@@ -307,7 +452,8 @@ template<typename T, typename Alloc>
 } // end vector_base::operator[]
 
 template<typename T, typename Alloc>
-  typename vector_base<T,Alloc>::const_reference 
+  __host__ __device__
+  typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::operator[](const size_type n) const
 {
@@ -315,6 +461,7 @@ template<typename T, typename Alloc>
 } // end vector_base::operator[]
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::iterator
     vector_base<T,Alloc>
       ::begin(void)
@@ -323,6 +470,7 @@ template<typename T, typename Alloc>
 } // end vector_base::begin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::begin(void) const
@@ -331,6 +479,7 @@ template<typename T, typename Alloc>
 } // end vector_base::begin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::cbegin(void) const
@@ -339,6 +488,7 @@ template<typename T, typename Alloc>
 } // end vector_base::cbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reverse_iterator
     vector_base<T,Alloc>
       ::rbegin(void)
@@ -347,6 +497,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::rbegin(void) const
@@ -355,6 +506,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::crbegin(void) const
@@ -363,6 +515,7 @@ template<typename T, typename Alloc>
 } // end vector_base::crbegin()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::iterator
     vector_base<T,Alloc>
       ::end(void)
@@ -373,6 +526,7 @@ template<typename T, typename Alloc>
 } // end vector_base::end()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::end(void) const
@@ -383,6 +537,7 @@ template<typename T, typename Alloc>
 } // end vector_base::end()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_iterator
     vector_base<T,Alloc>
       ::cend(void) const
@@ -391,6 +546,7 @@ template<typename T, typename Alloc>
 } // end vector_base::cend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reverse_iterator
     vector_base<T,Alloc>
       ::rend(void)
@@ -399,6 +555,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::rend(void) const
@@ -407,6 +564,7 @@ template<typename T, typename Alloc>
 } // end vector_base::rend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reverse_iterator
     vector_base<T,Alloc>
       ::crend(void) const
@@ -415,6 +573,7 @@ template<typename T, typename Alloc>
 } // end vector_base::crend()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::front(void) const
@@ -423,6 +582,7 @@ template<typename T, typename Alloc>
 } // end vector_base::front()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::front(void)
@@ -431,6 +591,7 @@ template<typename T, typename Alloc>
 } // end vector_base::front()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_reference
     vector_base<T,Alloc>
       ::back(void) const
@@ -441,6 +602,7 @@ template<typename T, typename Alloc>
 } // end vector_base::vector_base
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::reference
     vector_base<T,Alloc>
       ::back(void)
@@ -451,19 +613,21 @@ template<typename T, typename Alloc>
 } // end vector_base::vector_base
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::pointer
     vector_base<T,Alloc>
       ::data(void)
 {
-  return &front();
+  return pointer(&front());
 } // end vector_base::data()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   typename vector_base<T,Alloc>::const_pointer
     vector_base<T,Alloc>
       ::data(void) const
 {
-  return &front();
+  return const_pointer(&front());
 } // end vector_base::data()
 
 template<typename T, typename Alloc>
@@ -471,17 +635,19 @@ template<typename T, typename Alloc>
     ::~vector_base(void)
 {
   // destroy every living thing
-  m_storage.destroy(begin(),end());
+  if (!empty())
+    m_storage.destroy(begin(),end());
 } // end vector_base::~vector_base()
 
 template<typename T, typename Alloc>
   void vector_base<T,Alloc>
     ::clear(void)
 {
-  resize(0);
+  erase(begin(), end());
 } // end vector_base::~vector_dev()
 
 template<typename T, typename Alloc>
+  __host__ __device__
   bool vector_base<T,Alloc>
     ::empty(void) const
 {
@@ -711,7 +877,7 @@ template<typename T, typename Alloc>
         throw std::length_error("insert(): insertion exceeds max_size().");
       } // end if
 
-      storage_type new_storage(new_capacity);
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
 
       // record how many constructors we invoke in the try block below
       iterator new_end = new_storage.begin();
@@ -731,7 +897,7 @@ template<typename T, typename Alloc>
       } // end try
       catch(...)
       {
-        // something went wrong, so destroy & deallocate the new storage 
+        // something went wrong, so destroy & deallocate the new storage
         m_storage.destroy(new_storage.begin(), new_end);
         new_storage.deallocate();
 
@@ -779,7 +945,7 @@ template<typename T, typename Alloc>
       new_capacity = thrust::min THRUST_PREVENT_MACRO_SUBSTITUTION <size_type>(new_capacity, max_size());
 
       // create new storage
-      storage_type new_storage(new_capacity);
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
 
       // record how many constructors we invoke in the try block below
       iterator new_end = new_storage.begin();
@@ -790,13 +956,13 @@ template<typename T, typename Alloc>
         new_end = m_storage.uninitialized_copy(begin(), end(), new_storage.begin());
 
         // construct new elements to insert
-        m_storage.default_construct_n(new_end, n);
+        new_storage.default_construct_n(new_end, n);
         new_end += n;
       } // end try
       catch(...)
       {
-        // something went wrong, so destroy & deallocate the new storage 
-        m_storage.destroy(new_storage.begin(), new_end);
+        // something went wrong, so destroy & deallocate the new storage
+        new_storage.destroy(new_storage.begin(), new_end);
         new_storage.deallocate();
 
         // rethrow
@@ -879,7 +1045,7 @@ template<typename T, typename Alloc>
         throw std::length_error("insert(): insertion exceeds max_size().");
       } // end if
 
-      storage_type new_storage(new_capacity);
+      storage_type new_storage(copy_allocator_t(), m_storage, new_capacity);
 
       // record how many constructors we invoke in the try block below
       iterator new_end = new_storage.begin();
@@ -900,7 +1066,7 @@ template<typename T, typename Alloc>
       } // end try
       catch(...)
       {
-        // something went wrong, so destroy & deallocate the new storage 
+        // something went wrong, so destroy & deallocate the new storage
         m_storage.destroy(new_storage.begin(), new_end);
         new_storage.deallocate();
 
@@ -943,7 +1109,7 @@ template<typename T, typename Alloc>
   {
     *current = *first;
   } // end for
-  
+
   // either just the input was exhausted or both
   // the input and vector elements were exhausted
   if(first == last)
@@ -969,7 +1135,7 @@ template<typename T, typename Alloc>
 
   if(n > capacity())
   {
-    storage_type new_storage;
+    storage_type new_storage(copy_allocator_t(), m_storage);
     allocate_and_copy(n, first, last, new_storage);
 
     // call destructors on the elements in the old storage
@@ -994,7 +1160,7 @@ template<typename T, typename Alloc>
   {
     // range fits inside allocated storage, but some elements
     // have not been constructed yet
-    
+
     // XXX TODO we could possibly implement this with one call
     // to transform rather than copy + uninitialized_copy
 
@@ -1076,7 +1242,7 @@ template<typename T, typename Alloc>
   } // end try
   catch(...)
   {
-    // something went wrong, so destroy & deallocate the new storage 
+    // something went wrong, so destroy & deallocate the new storage
     // XXX seems like this destroys too many elements -- should just be last - first instead of requested_size
     iterator new_storage_end = new_storage.begin();
     thrust::advance(new_storage_end, requested_size);
@@ -1102,7 +1268,7 @@ template<typename T, typename Alloc>
 
 namespace detail
 {
-    
+
 // iterator tags match
 template <typename InputIterator1, typename InputIterator2>
 bool vector_equal(InputIterator1 first1, InputIterator1 last1,
@@ -1158,7 +1324,7 @@ bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
 {
     return lhs.size() == rhs.size() && detail::vector_equal(lhs.begin(), lhs.end(), rhs.begin());
 }
-    
+
 template<typename T1, typename Alloc1,
          typename T2, typename Alloc2>
 bool operator==(const detail::vector_base<T1,Alloc1>& lhs,
@@ -1182,7 +1348,7 @@ bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
 {
     return !(lhs == rhs);
 }
-    
+
 template<typename T1, typename Alloc1,
          typename T2, typename Alloc2>
 bool operator!=(const detail::vector_base<T1,Alloc1>& lhs,
@@ -1199,5 +1365,5 @@ bool operator!=(const std::vector<T1,Alloc1>&         lhs,
     return !(lhs == rhs);
 }
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/device_allocator.h b/thrust/device_allocator.h
index ca9c1eb17..f64c3854f 100644
--- a/thrust/device_allocator.h
+++ b/thrust/device_allocator.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,109 +15,126 @@
  */
 
 
-/*! \file device_allocator.h
- *  \brief An allocator which creates new elements in device memory
+/*! \file
+ *  \brief An allocator which creates new elements in memory accessible by
+ *  devices.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/device_new_allocator.h>
+#include <thrust/device_ptr.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/device_memory_resource.h>
+
 #include <limits>
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup memory_management_classes Memory Management Classes
+/** \addtogroup allocators Allocators
+ *  \ingroup memory_management
  *  \{
  */
 
-template<typename T> class device_allocator;
-
-/*! \p device_allocator<void> is a device memory allocator.
- *  This class is a specialization for \c void.
- *
- *  \see device_ptr
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+/*! Memory resource adaptor that turns any memory resource that returns a fancy
+ *      with the same tag as \p device_ptr, and adapts it to a resource that returns
+ *      a \p device_ptr.
  */
-template<>
-  class device_allocator<void>
+template<typename Upstream>
+class device_ptr_memory_resource final
+    : public thrust::mr::memory_resource<
+        device_ptr<void>
+    >
 {
-  public:
-    /*! Type of element allocated, \c void. */
-    typedef void                              value_type;
-
-    /*! Pointer to allocation, \c device_ptr<void>. */
-    typedef device_ptr<void>                  pointer;
-
-    /*! \c const pointer to allocation, \c device_ptr<const void>. */
-    typedef device_ptr<const void>            const_pointer;
-
-    /*! Type of allocation size, \c std::size_t. */
-    typedef std::size_t                       size_type;
+    typedef typename Upstream::pointer upstream_ptr;
 
-    /*! Type of allocation difference, \c pointer::difference_type. */
-    typedef pointer::difference_type difference_type;
+public:
+    /*! Initialize the adaptor with the global instance of the upstream resource. Obtains
+     *      the global instance by calling \p get_global_resource.
+     */
+    __host__
+    device_ptr_memory_resource() : m_upstream(mr::get_global_resource<Upstream>())
+    {
+    }
 
-    /*! The \p rebind metafunction provides the type of a \p device_allocator
-     *  instantiated with another type.
+    /*! Initialize the adaptor with an upstream resource.
      *
-     *  \tparam U The other type to use for instantiation.
+     *  \param upstream the upstream memory resource to adapt.
      */
-    template<typename U>
-      struct rebind
+    __host__
+    device_ptr_memory_resource(Upstream * upstream) : m_upstream(upstream)
+    {
+    }
+
+    THRUST_NODISCARD __host__
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        return pointer(m_upstream->do_allocate(bytes, alignment).get());
+    }
+
+    __host__
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) override
     {
-      /*! The typedef \p other gives the type of the rebound \p device_allocator.
-       */
-      typedef device_allocator<U> other;
-    }; // end rebind
-}; // end device_allocator<void>
-
-/*! \p device_allocator is a device memory allocator.
- *  This implementation inherits from \p device_new_allocator.
+        m_upstream->do_deallocate(upstream_ptr(p.get()), bytes, alignment);
+    }
+
+private:
+    Upstream * m_upstream;
+};
+
+/*! \brief An allocator which creates new elements in memory accessible by
+ *         devices.
  *
- *  \see device_ptr
- *  \see device_new_allocator
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+ *  \see https://en.cppreference.com/w/cpp/named_req/Allocator
  */
 template<typename T>
-  class device_allocator
-    : public device_new_allocator<T>
+class device_allocator
+    : public thrust::mr::stateless_resource_allocator<
+        T,
+        device_ptr_memory_resource<device_memory_resource>
+    >
 {
-  public:
+    typedef thrust::mr::stateless_resource_allocator<
+        T,
+        device_ptr_memory_resource<device_memory_resource>
+    > base;
+
+public:
     /*! The \p rebind metafunction provides the type of a \p device_allocator
      *  instantiated with another type.
      *
-     *  \tparam U The other type to use for instantiation.
+     *  \tparam U the other type to use for instantiation.
      */
     template<typename U>
-      struct rebind
+    struct rebind
     {
-      /*! The typedef \p other gives the type of the rebound \p device_allocator.
-       */
-      typedef device_allocator<U> other;
-    }; // end rebind
+        /*! The typedef \p other gives the type of the rebound \p device_allocator.
+         */
+        typedef device_allocator<U> other;
+    };
 
-    /*! No-argument constructor has no effect.
-     */
+    /*! Default constructor has no effect. */
     __host__ __device__
-    inline device_allocator() {}
+    device_allocator() {}
 
-    /*! Copy constructor has no effect.
-     */
+    /*! Copy constructor has no effect. */
     __host__ __device__
-    inline device_allocator(device_allocator const&) {}
+    device_allocator(const device_allocator& other) : base(other) {}
 
-    /*! Constructor from other \p allocator has no effect.
-     */
+    /*! Constructor from other \p device_allocator has no effect. */
     template<typename U>
     __host__ __device__
-    inline device_allocator(device_allocator<U> const&) {}
-}; // end device_allocator
+    device_allocator(const device_allocator<U>& other) : base(other) {}
 
-/*! \}
- */
+    device_allocator & operator=(const device_allocator &) = default;
 
-} // end thrust
+    /*! Destructor has no effect. */
+    __host__ __device__
+    ~device_allocator() {}
+};
+
+/*! \} // allocators
+ */
 
+THRUST_NAMESPACE_END
diff --git a/thrust/device_delete.h b/thrust/device_delete.h
index ce822f09d..0811936fb 100644
--- a/thrust/device_delete.h
+++ b/thrust/device_delete.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_delete.h
- *  \brief Deletes variables in device memory
+/*! \file
+ *  \brief Deletes variables in device memory.
  */
 
 #pragma once
@@ -24,11 +23,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup deallocation_functions Deallocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -47,10 +44,10 @@ template<typename T>
   inline void device_delete(thrust::device_ptr<T> ptr,
                             const size_t n = 1);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_delete.inl>
 
diff --git a/thrust/device_free.h b/thrust/device_free.h
index 38d4424c7..1cd305045 100644
--- a/thrust/device_free.h
+++ b/thrust/device_free.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_free.h
- *  \brief Deallocates storage allocated by \p device_malloc
+/*! \file 
+ *  \brief Deallocates storage allocated by \p device_malloc.
  */
 
 #pragma once
@@ -24,11 +23,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup deallocation_functions Deallocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -59,10 +56,10 @@ namespace thrust
  */
 inline void device_free(thrust::device_ptr<void> ptr);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_free.inl>
 
diff --git a/thrust/device_make_unique.h b/thrust/device_make_unique.h
new file mode 100644
index 000000000..cdb8c31d8
--- /dev/null
+++ b/thrust/device_make_unique.h
@@ -0,0 +1,60 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file device_make_unique.h
+ *  \brief A factory function for creating `unique_ptr`s to device objects.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/allocate_unique.h>
+#include <thrust/device_new.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_allocator.h>
+#include <thrust/detail/type_deduction.h>
+
+THRUST_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename... Args>
+__host__
+auto device_make_unique(Args&&... args)
+  THRUST_TRAILING_RETURN(decltype(
+    uninitialized_allocate_unique<T>(device_allocator<T>{})
+  ))
+{
+#if !defined(THRUST_DOXYGEN) // This causes Doxygen to choke for some reason.
+  // FIXME: This is crude - we construct an unnecessary T on the host for
+  // `device_new`. We need a proper dispatched `construct` algorithm to
+  // do this properly.
+  auto p = uninitialized_allocate_unique<T>(device_allocator<T>{});
+  device_new<T>(p.get(), T(THRUST_FWD(args)...));
+  return p;
+#endif
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/device_malloc.h b/thrust/device_malloc.h
index 75194491e..790ddbac3 100644
--- a/thrust/device_malloc.h
+++ b/thrust/device_malloc.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc.h
- *  \brief Allocates storage in device memory
+/*! \file
+ *  \brief Allocates storage in device memory.
  */
 
 #pragma once
@@ -25,11 +24,9 @@
 #include <thrust/device_ptr.h>
 #include <cstddef> // for std::size_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup allocation_functions Allocation Functions
- *  \ingroup memory_management_functions
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -94,10 +91,10 @@ inline thrust::device_ptr<void> device_malloc(const std::size_t n);
 template<typename T>
   inline thrust::device_ptr<T> device_malloc(const std::size_t n);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_malloc.inl>
 
diff --git a/thrust/device_malloc_allocator.h b/thrust/device_malloc_allocator.h
index 00939b73c..1b15045f2 100644
--- a/thrust/device_malloc_allocator.h
+++ b/thrust/device_malloc_allocator.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_malloc_allocator.h
- *  \brief An allocator which allocates storage with \p device_malloc
+/*! \file 
+ *  \brief An allocator which allocates storage with \p device_malloc.
  */
 
 #pragma once
@@ -29,15 +28,13 @@
 #include <limits>
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declarations to WAR circular #includes
 template<typename> class device_ptr;
 template<typename T> device_ptr<T> device_malloc(const std::size_t n);
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup allocators Allocators 
  *  \ingroup memory_management
  *  \{
  */
@@ -45,9 +42,13 @@ template<typename T> device_ptr<T> device_malloc(const std::size_t n);
 /*! \p device_malloc_allocator is a device memory allocator that employs the
  *  \p device_malloc function for allocation.
  *
+ *  \p device_malloc_allocator is deprecated in favor of <tt>thrust::mr</tt>
+ *      memory resource-based allocators.
+ *
  *  \see device_malloc
  *  \see device_ptr
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+ *  \see device_allocator
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template<typename T>
   class device_malloc_allocator
@@ -104,12 +105,16 @@ template<typename T>
     __host__ __device__
     inline device_malloc_allocator(device_malloc_allocator<U> const&) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+    device_malloc_allocator & operator=(const device_malloc_allocator &) = default;
+#endif
+
     /*! Returns the address of an allocated object.
      *  \return <tt>&r</tt>.
      */
     __host__ __device__
     inline pointer address(reference r) { return &r; }
-    
+
     /*! Returns the address an allocated object.
      *  \return <tt>&r</tt>.
      */
@@ -142,6 +147,9 @@ template<typename T>
     __host__
     inline void deallocate(pointer p, size_type cnt)
     {
+      // silence unused parameter warning while still leaving the parameter name for Doxygen
+      (void)(cnt);
+
       device_free(p);
     } // end deallocate()
 
@@ -157,18 +165,16 @@ template<typename T>
      *  \return \c true
      */
     __host__ __device__
-    inline bool operator==(device_malloc_allocator const&) { return true; }
+    inline bool operator==(device_malloc_allocator const&) const { return true; }
 
     /*! Compares against another \p device_malloc_allocator for inequality.
      *  \return \c false
      */
     __host__ __device__
-    inline bool operator!=(device_malloc_allocator const &a) {return !operator==(a); }
+    inline bool operator!=(device_malloc_allocator const &a) const {return !operator==(a); }
 }; // end device_malloc_allocator
 
-/*! \}
+/*! \} // allocators
  */
 
-} // end thrust
-
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_new.h b/thrust/device_new.h
index 1ae4ce5a4..c615e58f2 100644
--- a/thrust/device_new.h
+++ b/thrust/device_new.h
@@ -27,11 +27,10 @@
 #include <cstddef>
 #include <thrust/device_ptr.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*!
- *  \addtogroup allocation_functions Allocation Functions
+ *  \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -79,10 +78,9 @@ template <typename T>
 template <typename T>
   device_ptr<T> device_new(const size_t n = 1);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_new.inl>
-
diff --git a/thrust/device_new_allocator.h b/thrust/device_new_allocator.h
index bc3b7cd2e..c9c6b0e95 100644
--- a/thrust/device_new_allocator.h
+++ b/thrust/device_new_allocator.h
@@ -14,9 +14,8 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_new_allocator.h
- *  \brief An allocator which allocates storage with \p device_new
+/*! \file 
+ *  \brief An allocator which allocates storage with \p device_new.
  */
 
 #pragma once
@@ -26,14 +25,15 @@
 #include <thrust/device_reference.h>
 #include <thrust/device_new.h>
 #include <thrust/device_delete.h>
-#include <limits>
+
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+
 #include <stdexcept>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
+/*! \addtogroup allocators Allocators
  *  \ingroup memory_management
  *  \{
  */
@@ -43,7 +43,7 @@ namespace thrust
  *
  *  \see device_new
  *  \see device_ptr
- *  \see http://www.sgi.com/tech/stl/Allocators.html
+ *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template<typename T>
   class device_new_allocator
@@ -64,8 +64,8 @@ template<typename T>
     /*! \c const reference to allocated element, \c device_reference<const T>. */
     typedef device_reference<const T>         const_reference;
 
-    /*! Type of allocation size, \c std::size_t. */
-    typedef std::size_t                       size_type;
+    /*! Type of allocation size, \c ::cuda::std::size_t. */
+    typedef ::cuda::std::size_t                 size_type;
 
     /*! Type of allocation difference, \c pointer::difference_type. */
     typedef typename pointer::difference_type difference_type;
@@ -140,6 +140,7 @@ template<typename T>
     inline void deallocate(pointer p, size_type cnt)
     {
       // use "::operator delete" rather than keyword delete
+      (void)cnt;
       device_delete(p);
     } // end deallocate()
 
@@ -149,7 +150,7 @@ template<typename T>
     __host__ __device__
     inline size_type max_size() const
     {
-      return std::numeric_limits<size_type>::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T);
+      return ::cuda::std::numeric_limits<size_type>::max THRUST_PREVENT_MACRO_SUBSTITUTION () / sizeof(T);
     } // end max_size()
 
     /*! Compares against another \p device_malloc_allocator for equality.
@@ -165,8 +166,7 @@ template<typename T>
     inline bool operator!=(device_new_allocator const &a) {return !operator==(a); }
 }; // end device_new_allocator
 
-/*! \}
+/*! \} // allocators
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_ptr.h b/thrust/device_ptr.h
index e209319ed..5ef4aa464 100644
--- a/thrust/device_ptr.h
+++ b/thrust/device_ptr.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_ptr.h
- *  \brief A pointer to a variable which resides in the "device" system's memory space
+/*! \file
+ *  \brief A pointer to an object which resides in memory associated with the
+ *  \c device system.
  */
 
 #pragma once
@@ -24,151 +24,188 @@
 #include <thrust/detail/config.h>
 #include <thrust/memory.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
  *  \{
  */
 
-// forward declarations
-template<typename T> class device_reference;
+template <typename T> class device_reference;
 
-/*! \p device_ptr stores a pointer to an object allocated in device memory. This type
- *  provides type safety when dispatching standard algorithms on ranges resident in
- *  device memory.
+/*! \brief \c device_ptr is a pointer-like object which points to an object that
+ *  resides in memory associated with the \ref device system.
  *
- *  \p device_ptr has pointer semantics: it may be dereferenced safely from the host and
- *  may be manipulated with pointer arithmetic.
+ *  \c device_ptr has pointer semantics: it may be dereferenced safely from
+ *  anywhere, including the \ref host, and may be manipulated with pointer
+ *  arithmetic.
  *
- *  \p device_ptr can be created with the functions device_malloc, device_new, or
- *  device_pointer_cast, or by explicitly calling its constructor with a raw pointer.
+ *  \c device_ptr can be created with \ref device_new, \ref device_malloc,
+ *  \ref device_malloc_allocator, \ref device_allocator, or
+ *  \ref device_pointer_cast, or by explicitly calling its constructor with a
+ *  raw pointer.
  *
- *  The raw pointer encapsulated by a \p device_ptr may be obtained by either its <tt>get</tt>
- *  method or the \p raw_pointer_cast free function.
+ *  The raw pointer contained in a \c device_ptr may be obtained via \c get
+ *  member function or the \ref raw_pointer_cast free function.
  *
- *  \note \p device_ptr is not a smart pointer; it is the programmer's responsibility to
- *  deallocate memory pointed to by \p device_ptr.
+ *  \ref algorithms operating on \c device_ptr types will automatically be
+ *  dispatched to the \ref device system.
+ *
+ *  \note \c device_ptr is not a smart pointer; it is the programmer's
+ *  responsibility to deallocate memory pointed to by \c device_ptr.
  *
- *  \see device_malloc
  *  \see device_new
+ *  \see device_malloc
+ *  \see device_malloc_allocator
+ *  \see device_allocator
  *  \see device_pointer_cast
  *  \see raw_pointer_cast
  */
-template<typename T>
-  class device_ptr
-    : public thrust::pointer<
-               T,
-               thrust::device_system_tag,
-               thrust::device_reference<T>,
-               thrust::device_ptr<T>
-             >
+template <typename T>
+class device_ptr
+  : public thrust::pointer<
+      T,
+      thrust::device_system_tag,
+      thrust::device_reference<T>,
+      thrust::device_ptr<T>
+    >
 {
   private:
-    typedef thrust::pointer<
+    using super_t = thrust::pointer<
       T,
       thrust::device_system_tag,
       thrust::device_reference<T>,
       thrust::device_ptr<T>
-    > super_t;
+    >;
 
   public:
-    /*! \p device_ptr's null constructor initializes its raw pointer to \c 0.
+    /*! \brief Construct a null \c device_ptr.
+     *
+     *  \post <tt>get() == nullptr</tt>.
      */
     __host__ __device__
     device_ptr() : super_t() {}
 
-    /*! \p device_ptr's copy constructor is templated to allow copying to a
-     *  <tt>device_ptr<const T></tt> from a <tt>T *</tt>.
-     *  
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in
-     *         device memory.
+    /*! \brief Construct a null \c device_ptr.
+     *
+     *  \param ptr A null pointer.
+     *
+     *  \post <tt>get() == nullptr</tt>.
      */
-    template<typename OtherT>
     __host__ __device__
-    explicit device_ptr(OtherT *ptr) : super_t(ptr) {}
-
-    /*! \p device_ptr's copy constructor allows copying from another device_ptr with related type.
-     *  \param other The \p device_ptr to copy from.
+    device_ptr(std::nullptr_t) : super_t(nullptr) {}
+
+    /*! \brief Construct a \c device_ptr from a raw pointer which is
+     *  convertible to \c T*.
+     *
+     *  \tparam U   A type whose pointer is convertible to \c T*.
+     *  \param  ptr A raw pointer to a \c U in device memory to construct from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \pre \c ptr points to a location in device memory.
+     *
+     *  \post <tt>get() == nullptr</tt>.
      */
-    template<typename OtherT>
+    template <typename U>
     __host__ __device__
-    device_ptr(const device_ptr<OtherT> &other) : super_t(other) {}
-
-    /*! \p device_ptr's assignment operator allows assigning from another \p device_ptr with related type.
-     *  \param other The other \p device_ptr to copy from.
-     *  \return <tt>*this</tt>
+    explicit device_ptr(U* ptr) : super_t(ptr) {}
+
+    /*! \brief Copy construct a \c device_ptr from another \c device_ptr whose
+     *  pointer type is convertible to \c T*.
+     *
+     *  \tparam U     A type whose pointer is convertible to \c T*.
+     *  \param  other A \c device_ptr to a \c U to construct from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \post <tt>get() == other.get()</tt>.
+     */
+    template <typename U>
+    __host__ __device__
+    device_ptr(device_ptr<U> const& other) : super_t(other) {}
+
+    /*! \brief Set this \c device_ptr to point to the same object as another
+     *  \c device_ptr whose pointer type is convertible to \c T*.
+     *
+     *  \tparam U     A type whose pointer is convertible to \c T*.
+     *  \param  other A \c device_ptr to a \c U to assign from.
+     *
+     *  \pre <tt>std::is_convertible_v<U*, T*> == true</tt>.
+     *
+     *  \post <tt>get() == other.get()</tt>.
+     *
+     *  \return \c *this.
      */
-    template<typename OtherT>
+    template <typename U>
     __host__ __device__
-    device_ptr &operator=(const device_ptr<OtherT> &other)
+    device_ptr &operator=(device_ptr<U> const& other)
     {
       super_t::operator=(other);
       return *this;
     }
 
-// declare these members for the purpose of Doxygenating them
-// they actually exist in a derived-from class
-#if 0
-    /*! This method returns this \p device_ptr's raw pointer.
-     *  \return This \p device_ptr's raw pointer.
+    /*! \brief Set this \c device_ptr to null.
+     *
+     *  \param ptr A null pointer.
+     *
+     *  \post <tt>get() == nullptr</tt>.
+     *
+     *  \return \c *this.
      */
     __host__ __device__
-    T *get(void) const;
-#endif // end doxygen-only members
-}; // end device_ptr
-
-// declare these methods for the purpose of Doxygenating them
-// they actually are defined for a derived-from class
-#if 0
-/*! Writes to an output stream the value of a \p device_ptr's raw pointer.
+    device_ptr& operator=(std::nullptr_t)
+    {
+      super_t::operator=(nullptr);
+      return *this;
+    }
+
+#if THRUST_DOXYGEN
+    /*! \brief Return the raw pointer that this \c device_ptr points to.
+     */
+    __host__ __device__
+    T* get() const;
+#endif
+};
+
+#if THRUST_DOXYGEN
+/*! Write the address that a \c device_ptr points to to an output stream.
  *
  *  \param os The output stream.
- *  \param p The \p device_ptr to output.
- *  \return os.
+ *  \param dp The \c device_ptr to output.
+ *
+ *  \return \c os.
  */
-template<typename T, typename charT, typename traits>
-std::basic_ostream<charT, traits> &
-operator<<(std::basic_ostream<charT, traits> &os, const device_ptr<T> &p);
+template <typename T, typename CharT, typename Traits>
+__host__ std::basic_ostream<CharT, Traits>&
+operator<<(std::basic_ostream<CharT, Traits>& os, device_ptr<T> const& dp);
 #endif
 
-/*! \}
- */
-
-
-/*!
- *  \addtogroup memory_management_functions Memory Management Functions
- *  \ingroup memory_management
- *  \{
- */
-
-/*! \p device_pointer_cast creates a device_ptr from a raw pointer which is presumed to point
- *  to a location in device memory.
+/*! \brief Create a \c device_ptr from a raw pointer.
  *
- *  \param ptr A raw pointer, presumed to point to a location in device memory.
- *  \return A device_ptr wrapping ptr.
+ *  \tparam T   Any type.
+ *  \param  ptr A raw pointer to a \c T in device memory.
+ *
+ *  \pre \c ptr points to a location in device memory.
+ *
+ *  \return A \c device_ptr<T> pointing to \c ptr.
  */
-template<typename T>
+template <typename T>
 __host__ __device__
-inline device_ptr<T> device_pointer_cast(T *ptr);
+device_ptr<T> device_pointer_cast(T* ptr);
 
-/*! This version of \p device_pointer_cast creates a copy of a device_ptr from another device_ptr.
- *  This version is included for symmetry with \p raw_pointer_cast.
+/*! \brief Create a \c device_ptr from another \c device_ptr.
  *
- *  \param ptr A device_ptr.
- *  \return A copy of \p ptr.
+ *  \tparam T    Any type.
+ *  \param  dptr A \c device_ptr to a \c T.
  */
 template<typename T>
 __host__ __device__
-inline device_ptr<T> device_pointer_cast(const device_ptr<T> &ptr);
+device_ptr<T> device_pointer_cast(device_ptr<T> const& dptr);
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/device_ptr.inl>
 #include <thrust/detail/raw_pointer_cast.h>
-
diff --git a/thrust/device_reference.h b/thrust/device_reference.h
index 331ee8922..512ab4c60 100644
--- a/thrust/device_reference.h
+++ b/thrust/device_reference.h
@@ -14,9 +14,9 @@
  *  limitations under the License.
  */
 
-
-/*! \file device_reference.h
- *  \brief A reference to a variable which resides in the "device" system's memory space
+/*! \file 
+ *  \brief A reference to an object which resides in memory associated with the
+ *  device system.
  */
 
 #pragma once
@@ -26,11 +26,9 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/reference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
+/*! \addtogroup memory_management Memory Management
  *  \{
  */
 
@@ -38,7 +36,7 @@ namespace thrust
  *  \p device_reference is not intended to be used directly; rather, this type
  *  is the result of deferencing a \p device_ptr. Similarly, taking the address of
  *  a \p device_reference yields a \p device_ptr.
- *  
+ *
  *  \p device_reference may often be used from host code in place of operations defined on
  *  its associated \c value_type. For example, when \p device_reference refers to an
  *  arithmetic type, arithmetic operations on it are legal:
@@ -158,7 +156,7 @@ namespace thrust
  *    return 0;
  *  }
  *  \endcode
- *  
+ *
  *  Another common case where a \p device_reference cannot directly be used in place of
  *  its referent object occurs when passing them as parameters to functions like \c printf
  *  which have varargs parameters. Because varargs parameters must be Plain Old Data, a
@@ -209,7 +207,7 @@ template<typename T>
     /*! This copy constructor accepts a const reference to another
      *  \p device_reference. After this \p device_reference is constructed,
      *  it shall refer to the same object as \p other.
-     *  
+     *
      *  \param other A \p device_reference to copy from.
      *
      *  The following code snippet demonstrates the semantics of this
@@ -233,7 +231,7 @@ template<typename T>
      *  assert(ref == 13);
      *  \endcode
      *
-     *  \note This constructor is templated primarily to allow initialization of 
+     *  \note This constructor is templated primarily to allow initialization of
      *  <tt>device_reference<const T></tt> from <tt>device_reference<T></tt>.
      */
     template<typename OtherT>
@@ -289,16 +287,22 @@ template<typename T>
      */
     template<typename OtherT>
     __host__ __device__
-    device_reference &operator=(const device_reference<OtherT> &other);
+    device_reference &operator=(const device_reference<OtherT> &other)
+    {
+      return super_t::operator=(other);
+    }
 
     /*! Assignment operator assigns the value of the given value to the
      *  value referenced by this \p device_reference.
-     *  
+     *
      *  \param x The value to assign from.
      *  \return <tt>*this</tt>
      */
     __host__ __device__
-    device_reference &operator=(const value_type &x);
+    device_reference &operator=(const value_type &x)
+    {
+      return super_t::operator=(x);
+    }
 
 // declare these members for the purpose of Doxygenating them
 // they actually exist in a derived-from class
@@ -332,7 +336,7 @@ template<typename T>
      *  \p device_reference.
      *
      *  \return <tt>*this</tt>
-     *  
+     *
      *  The following code snippet demonstrates the semantics of
      *  \p device_reference's prefix increment operator.
      *
@@ -467,7 +471,7 @@ template<typename T>
      *  \p device_reference.
      *
      *  \return <tt>*this</tt>
-     *  
+     *
      *  The following code snippet demonstrates the semantics of
      *  \p device_reference's prefix decrement operator.
      *
@@ -958,11 +962,14 @@ template<typename T>
  */
 template<typename T>
 __host__ __device__
-void swap(device_reference<T> &x, device_reference<T> &y);
+void swap(device_reference<T>& x, device_reference<T>& y)
+{
+  x.swap(y);
+}
 
 // declare these methods for the purpose of Doxygenating them
 // they actually are defined for a derived-from class
-#if 0
+#if THRUST_DOXYGEN
 /*! Writes to an output stream the value of a \p device_reference.
  *
  *  \param os The output stream.
@@ -974,10 +981,7 @@ std::basic_ostream<charT, traits> &
 operator<<(std::basic_ostream<charT, traits> &os, const device_reference<T> &y);
 #endif
 
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
-
-#include <thrust/detail/device_reference.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/device_vector.h b/thrust/device_vector.h
index af4d98ba1..9b97e8d70 100644
--- a/thrust/device_vector.h
+++ b/thrust/device_vector.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,26 +15,24 @@
  */
 
 
-/*! \file device_vector.h
- *  \brief A dynamically-sizable array of elements which reside in the "device" memory space
+/*! \file
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to devices.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/device_malloc_allocator.h>
 #include <thrust/detail/vector_base.h>
-#include <vector>
+#include <thrust/device_allocator.h>
 
-namespace thrust
-{
+#include <initializer_list>
+#include <vector>
+#include <utility>
 
-// forward declaration of host_vector
-template<typename T, typename Alloc> class host_vector;
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup container_classes Container Classes
- *  \addtogroup device_containers Device Containers
- *  \ingroup container_classes
+/*! \addtogroup containers Containers
  *  \{
  */
 
@@ -42,13 +40,15 @@ template<typename T, typename Alloc> class host_vector;
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p device_vector may vary dynamically; memory management is
- *  automatic. The memory associated with a \p device_vector resides in the memory
- *  space of a parallel device.
+ *  automatic. The memory associated with a \p device_vector resides in the
+ *  memory accessible to devices.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see device_allocator
  *  \see host_vector
+ *  \see universal_vector
  */
-template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
+template<typename T, typename Alloc = thrust::device_allocator<T> >
   class device_vector
     : public detail::vector_base<T,Alloc>
 {
@@ -65,54 +65,106 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
 
     /*! This constructor creates an empty \p device_vector.
      */
-    __host__
     device_vector(void)
       :Parent() {}
 
+    /*! This constructor creates an empty \p device_vector.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(const Alloc &alloc)
+      :Parent(alloc) {}
+
     /*! The destructor erases the elements.
      */
     //  Define an empty destructor to explicitly specify
     //  its execution space qualifier, as a workaround for nvcc warning
-    __host__
     ~device_vector(void) {}
 
     /*! This constructor creates a \p device_vector with the given
      *  size.
      *  \param n The number of elements to initially create.
      */
-    __host__
     explicit device_vector(size_type n)
       :Parent(n) {}
 
+    /*! This constructor creates a \p device_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    explicit device_vector(size_type n, const Alloc &alloc)
+      :Parent(n,alloc) {}
+
     /*! This constructor creates a \p device_vector with copies
      *  of an exemplar element.
      *  \param n The number of elements to initially create.
      *  \param value An element to copy.
      */
-    __host__
     explicit device_vector(size_type n, const value_type &value)
       :Parent(n,value) {}
 
+    /*! This constructor creates a \p device_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    explicit device_vector(size_type n, const value_type &value, const Alloc &alloc)
+      :Parent(n,value,alloc) {}
+
     /*! Copy constructor copies from an exemplar \p device_vector.
      *  \param v The \p device_vector to copy.
      */
-    __host__
     device_vector(const device_vector &v)
       :Parent(v) {}
 
+    /*! Copy constructor copies from an exemplar \p device_vector.
+     *  \param v The \p device_vector to copy.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(const device_vector &v, const Alloc &alloc)
+      :Parent(v,alloc) {}
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     */
+    device_vector(device_vector &&v)
+      :Parent(std::move(v)) {}
+
+    /*! Move constructor moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(device_vector &&v, const Alloc &alloc)
+      :Parent(std::move(v), alloc) {}
+  #endif // THRUST_CPP_DIALECT >= 2011
+
+    /*! Copy assign operator copies another \p device_vector with the same type.
+     *  \param v The \p device_vector to copy.
+     */
+    device_vector &operator=(const device_vector &v)
+    { Parent::operator=(v); return *this; }
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move assign operator moves from another \p device_vector.
+     *  \param v The device_vector to move.
+     */
+     device_vector &operator=(device_vector &&v)
+     { Parent::operator=(std::move(v)); return *this; }
+  #endif // THRUST_CPP_DIALECT >= 2011
+
     /*! Copy constructor copies from an exemplar \p device_vector with different type.
      *  \param v The \p device_vector to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __device__
-    device_vector(const device_vector<OtherT,OtherAlloc> &v)
+    explicit device_vector(const device_vector<OtherT,OtherAlloc> &v)
       :Parent(v) {}
 
     /*! Assign operator copies from an exemplar \p device_vector with different type.
      *  \param v The \p device_vector to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __device__
     device_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
 
@@ -120,7 +172,6 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      *  \param v The <tt>std::vector</tt> to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
     device_vector(const std::vector<OtherT,OtherAlloc> &v)
       :Parent(v) {}
 
@@ -128,34 +179,61 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      *  \param v The <tt>std::vector</tt> to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
     device_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this;}
 
-    /*! Copy constructor copies from an exemplar \p host_vector with possibly different type.
-     *  \param v The \p host_vector to copy.
+    /*! Copy construct from a \p vector_base whose element type is convertible
+     *  to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector(const host_vector<OtherT,OtherAlloc> &v);
+    device_vector(const detail::vector_base<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
 
-    /*! Assign operator copies from an examplar \p host_vector.
-     *  \param v The \p host_vector to copy.
+    /*! Assign a \p vector_base whose element type is convertible to \c T.
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
-    __host__
-    device_vector &operator=(const host_vector<OtherT,OtherAlloc> &v)
+    device_vector &operator=(const detail::vector_base<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
 
+    /*! This constructor builds a \p device_vector from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    device_vector(std::initializer_list<T> il)
+      :Parent(il) {}
+      
+    /*! This constructor builds a \p device_vector from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    device_vector(std::initializer_list<T> il, const Alloc &alloc)
+      :Parent(il, alloc) {}
+      
+    /*! Assign an \p intializer_list with a matching element type
+     *  \param il The intializer_list.
+     */
+    device_vector &operator=(std::initializer_list<T> il)
+    { Parent::operator=(il); return *this; }
+
     /*! This constructor builds a \p device_vector from a range.
      *  \param first The beginning of the range.
      *  \param last The end of the range.
      */
     template<typename InputIterator>
-    __host__
     device_vector(InputIterator first, InputIterator last)
       :Parent(first,last) {}
 
+    /*! This constructor builds a \p device_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this device_vector.
+     */
+    template<typename InputIterator>
+    device_vector(InputIterator first, InputIterator last, const Alloc &alloc)
+      :Parent(first,last,alloc) {}
+
 // declare these members for the purpose of Doxygenating them
 // they actually exist in a derived-from class
 #if 0
@@ -346,7 +424,7 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      */
     void pop_back(void);
 
-    /*! This method swaps the contents of this vector_base with another vector.
+    /*! This method swaps the contents of this device_vector with another vector.
      *  \param v The vector with which to swap.
      */
     void swap(device_vector &v);
@@ -372,7 +450,7 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      *  \param x The exemplar element to copy & insert.
      *  \return An iterator pointing to the newly inserted element.
      */
-    iterator insert(iterator position, const T &x); 
+    iterator insert(iterator position, const T &x);
 
     /*! This method inserts a copy of an exemplar value to a range at the
      *  specified position in this vector.
@@ -388,8 +466,8 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -405,7 +483,7 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -415,13 +493,19 @@ template<typename T, typename Alloc = thrust::device_malloc_allocator<T> >
      */
     allocator_type get_allocator(void) const;
 #endif // end doxygen-only members
-}; // end device_vector
+};
 
-/*! \}
+/*! Exchanges the values of two vectors.
+ *  \p x The first \p device_vector of interest.
+ *  \p y The second \p device_vector of interest.
  */
+template<typename T, typename Alloc>
+  void swap(device_vector<T,Alloc> &a, device_vector<T,Alloc> &b)
+{
+  a.swap(b);
+}
 
-} // end thrust
-
-#include <thrust/detail/device_vector.inl>
-
+/*! \} // containres
+ */
 
+THRUST_NAMESPACE_END
diff --git a/thrust/distance.h b/thrust/distance.h
index 6dd4800be..890879115 100644
--- a/thrust/distance.h
+++ b/thrust/distance.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -40,7 +38,7 @@ namespace thrust
  *  \param last The end of an input range of interest.
  *  \return The distance between the beginning and end of the input range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \pre If \c InputIterator meets the requirements of random access iterator, \p last shall be reachable from \p first or
  *       \p first shall be reachable from \p last; otherwise, \p last shall be reachable from \p first.
@@ -61,7 +59,7 @@ namespace thrust
  *  // d is 7
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/distance.html
+ *  \see https://en.cppreference.com/w/cpp/iterator/distance
  */
 template<typename InputIterator>
 inline __host__ __device__
@@ -71,7 +69,6 @@ inline __host__ __device__
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/distance.inl>
-
diff --git a/thrust/equal.h b/thrust/equal.h
index bc6db5015..2f3518907 100644
--- a/thrust/equal.h
+++ b/thrust/equal.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -52,11 +50,11 @@ namespace thrust
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p equal to test
@@ -74,7 +72,7 @@ namespace thrust
  *  // result == false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2>
 __host__ __device__
@@ -93,11 +91,11 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  \param first2 The beginning of the second sequence.
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator1's \c value_type can be compared for equality with \c InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and \p InputIterator2's \c value_type can be compared for equality with \c InputIterator1's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p equal to test
@@ -114,7 +112,7 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  // result == false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template <typename InputIterator1, typename InputIterator2>
 bool equal(InputIterator1 first1, InputIterator1 last1,
@@ -139,11 +137,11 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p equal to compare the
  *  elements in two ranges modulo 2 using the \p thrust::host execution policy.
@@ -170,7 +168,7 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  // result is false
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename BinaryPredicate>
 __host__ __device__
@@ -191,11 +189,11 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  \param binary_pred Binary predicate used to test element equality.
  *  \return \c true, if the sequences are equal; \c false, otherwise.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p equal to compare the
  *  elements in two ranges modulo 2.
@@ -220,7 +218,7 @@ bool equal(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, Inp
  *  // result is true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/equal.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/equal
  */
 template <typename InputIterator1, typename InputIterator2, 
           typename BinaryPredicate>
@@ -232,7 +230,6 @@ bool equal(InputIterator1 first1, InputIterator1 last1,
  *  \} // end reductions
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/equal.inl>
-
diff --git a/thrust/system/cuda/detail/bulk/iterator.hpp b/thrust/event.h
similarity index 77%
rename from thrust/system/cuda/detail/bulk/iterator.hpp
rename to thrust/event.h
index 606d28b8e..75578d964 100644
--- a/thrust/system/cuda/detail/bulk/iterator.hpp
+++ b/thrust/event.h
@@ -14,8 +14,13 @@
  *  limitations under the License.
  */
 
+/*! \file thrust/event.h
+ *  \brief `thrust::event`, an asynchronous handle type.
+ */
+
 #pragma once
 
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp> 
+#include <thrust/future.h>
+
+// TODO: Actually separate `<thrust/future.h>` into two headers.
 
diff --git a/thrust/execution_policy.h b/thrust/execution_policy.h
index d86a6c163..ecf14413f 100644
--- a/thrust/execution_policy.h
+++ b/thrust/execution_policy.h
@@ -25,6 +25,8 @@
 #include <thrust/detail/execute_with_allocator.h>
 #include <thrust/detail/seq.h>
 
+//! \cond
+
 // #include the host system's execution_policy header
 #define __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER <__THRUST_HOST_SYSTEM_ROOT/execution_policy.h>
 #include __THRUST_HOST_SYSTEM_EXECUTION_POLICY_HEADER
@@ -35,9 +37,9 @@
 #include __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER
 #undef __THRUST_DEVICE_SYSTEM_EXECUTION_POLICY_HEADER
 
-namespace thrust
-{
+//! \endcond
 
+THRUST_NAMESPACE_BEGIN
 
 /*! \cond
  */
@@ -280,10 +282,9 @@ template<typename DerivedPolicy>
  *    }
  *  };
  *  ...
- *  int vec(3);
- *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
+ *  int vec[] = { 0, 1, 2 };
  *
- *  thrust::for_each(thrust::host, vec.begin(), vec.end(), printf_functor());
+ *  thrust::for_each(thrust::host, vec, vec + 3, printf_functor());
  *
  *  // 0 1 2 is printed to standard output in some unspecified order
  *  \endcode
@@ -340,11 +341,7 @@ static const detail::host_t host;
  *  \see host_execution_policy
  *  \see thrust::device
  */
-#ifdef __CUDA_ARCH__
-static const __device__ detail::device_t device;
-#else
-static const detail::device_t device;
-#endif
+THRUST_INLINE_CONSTANT detail::device_t device;
 
 
 // define seq for the purpose of Doxygenating it
@@ -392,5 +389,4 @@ static const detail::seq_t seq;
  */
 
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/extrema.h b/thrust/extrema.h
index c9fd016cc..ca419a0aa 100644
--- a/thrust/extrema.h
+++ b/thrust/extrema.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! This version of \p min returns the smaller of two values, given a comparison operation.
  *  \param lhs The first value to compare.
@@ -35,7 +33,7 @@ namespace thrust
  *  \return The smaller element.
  *
  *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min to compute the smaller of two
  *  key-value objects.
@@ -80,7 +78,7 @@ __host__ __device__
  *  \param rhs The second value to compare.
  *  \return The smaller element.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p min to compute the smaller of two
  *  integers.
@@ -111,7 +109,7 @@ __host__ __device__
  *  \return The larger element.
  *
  *  \tparam T is convertible to \p BinaryPredicate's first argument type and to its second argument type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">BinaryPredicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max to compute the larger of two
  *  key-value objects.
@@ -156,7 +154,7 @@ __host__ __device__
  *  \param rhs The second value to compare.
  *  \return The larger element.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p max to compute the larger of two
  *  integers.
@@ -207,9 +205,9 @@ __host__ __device__
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -222,7 +220,7 @@ __host__ __device__
  *  // *result is 0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -246,9 +244,9 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -260,7 +258,7 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *result is 0
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template <typename ForwardIterator>
 ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
@@ -288,10 +286,10 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min_element to find the smallest element
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
@@ -325,7 +323,7 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last);
  *  // *smallest == {0,7}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
 __host__ __device__
@@ -350,10 +348,10 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the smallest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p min_element to find the smallest element
  *  of a collection of key-value pairs.
@@ -385,7 +383,7 @@ ForwardIterator min_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *smallest == {0,7}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/min_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/min_element 
  */
 template <typename ForwardIterator, typename BinaryPredicate>
 ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
@@ -413,9 +411,9 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam A Thrust backend system.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -427,7 +425,7 @@ ForwardIterator min_element(ForwardIterator first, ForwardIterator last,
  *  // *result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -451,9 +449,9 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -464,7 +462,7 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template <typename ForwardIterator>
 ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
@@ -492,10 +490,10 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max_element to find the largest element
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization.
@@ -529,7 +527,7 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last);
  *  // *largest == {6,1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename BinaryPredicate>
 __host__ __device__
@@ -554,10 +552,10 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  \return An iterator pointing to the largest element of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p max_element to find the largest element
  *  of a collection of key-value pairs.
@@ -589,7 +587,7 @@ ForwardIterator max_element(const thrust::detail::execution_policy_base<DerivedP
  *  // *largest == {6,1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/max_element.html 
+ *  \see https://en.cppreference.com/w/cpp/algorithm/max_element 
  */
 template <typename ForwardIterator, typename BinaryPredicate>
 ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
@@ -610,9 +608,9 @@ ForwardIterator max_element(ForwardIterator first, ForwardIterator last,
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -646,9 +644,9 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detai
  *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \c ForwardIterator's \c value_type is a model of
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  \code
  *  #include <thrust/extrema.h>
@@ -686,10 +684,10 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator fir
  *          if it is not an empty range; \p last, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
  *  of a collection of key-value pairs using the \p thrust::host execution policy for parallelization:
@@ -746,10 +744,10 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(const thrust::detai
  *  \return A pair of iterator pointing to the smallest and largest elements of the range <tt>[first, last)</tt>,
  *          if it is not an empty range; \p last, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \p comp's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p minmax_element to find the smallest and largest elements
  *  of a collection of key-value pairs.
@@ -797,8 +795,7 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(ForwardIterator fir
  *  \} // end reductions
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/extrema.inl>
 #include <thrust/detail/minmax.h>
-
diff --git a/thrust/fill.h b/thrust/fill.h
index 850313802..bd9e40268 100644
--- a/thrust/fill.h
+++ b/thrust/fill.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \addtogroup filling
@@ -48,9 +46,9 @@ namespace thrust
  *  \param value The value to be copied.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -67,7 +65,7 @@ namespace thrust
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill
  *  \see \c fill_n
  *  \see \c uninitialized_fill
  */
@@ -88,9 +86,9 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param value The value to be copied.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -106,7 +104,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill
  *  \see \c fill_n
  *  \see \c uninitialized_fill
  */
@@ -131,8 +129,8 @@ __host__ __device__
  *  \return <tt>first + n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -149,7 +147,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill_n
  *  \see \c fill
  *  \see \c uninitialized_fill_n
  */
@@ -171,8 +169,8 @@ __host__ __device__
  *  \param value The value to be copied.
  *  \return <tt>first + n</tt>
  *
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T's \c value_type is convertible to a type in \p OutputIterator's set of \c value_type.
  *
  *  The following code snippet demonstrates how to use \p fill to set a thrust::device_vector's
@@ -188,7 +186,7 @@ __host__ __device__
  *  // v[0] == 137, v[1] == 137, v[2] == 137, v[3] == 137
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/fill_n.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/fill_n
  *  \see \c fill
  *  \see \c uninitialized_fill_n
  */
@@ -203,7 +201,6 @@ __host__ __device__
  *  \} // transformations
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/fill.inl>
-
diff --git a/thrust/find.h b/thrust/find.h
index 6e992499e..5ab9b0a2d 100644
--- a/thrust/find.h
+++ b/thrust/find.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -50,9 +48,9 @@ namespace thrust
  *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator's \c value_type is equality comparable to type \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">EqualityComparable</a>. 
  *
  *  \code
  *  #include <thrust/find.h>
@@ -93,9 +91,9 @@ InputIterator find(const thrust::detail::execution_policy_base<DerivedPolicy> &e
  *  \param value The value to find.
  *  \return The first iterator \c i such that <tt>*i == value</tt> or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator's \c value_type is equality comparable to type \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">EqualityComparable</a>. 
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">EqualityComparable</a>. 
  *
  *  \code
  *  #include <thrust/find.h>
@@ -137,8 +135,8 @@ InputIterator find(InputIterator first,
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -200,8 +198,8 @@ InputIterator find_if(const thrust::detail::execution_policy_base<DerivedPolicy>
  *  \param pred A predicate used to test range elements.
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c true, or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -263,8 +261,8 @@ InputIterator find_if(InputIterator first,
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -326,8 +324,8 @@ InputIterator find_if_not(const thrust::detail::execution_policy_base<DerivedPol
  *  \param pred A predicate used to test range elements.
  *  \return The first iterator \c i such that <tt>pred(*i)</tt> is \c false, or \c last.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/find.h>
@@ -378,8 +376,6 @@ InputIterator find_if_not(InputIterator first,
 /*! \} // end searching
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/find.inl>
-
diff --git a/thrust/for_each.h b/thrust/for_each.h
index 0eb305aee..7d05e3ea1 100644
--- a/thrust/for_each.h
+++ b/thrust/for_each.h
@@ -14,7 +14,7 @@
  */
 
 
-/*! \file for_each.h
+/*! \file thrust/for_each.h
  *  \brief Applies a function to each element in a range
  */
 
@@ -24,9 +24,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup modifying
  *  \ingroup transformations
@@ -50,13 +48,13 @@ namespace thrust
  *  \return last
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each to print the elements
- *  of a \p std::device_vector using the \p thrust::device parallelization policy:
+ *  of a \p thrust::device_vector using the \p thrust::device parallelization policy:
  *
  *  \code
  *  #include <thrust/for_each.h>
@@ -86,7 +84,7 @@ namespace thrust
  *  \endcode
  *
  *  \see for_each_n
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -113,10 +111,10 @@ InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \return <tt>first + n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
  *  \tparam Size is an integral type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each_n to print the elements
@@ -136,7 +134,7 @@ InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy
  *      // note that using printf in a __device__ function requires
  *      // code compiled for a GPU with compute capability 2.0 or
  *      // higher (nvcc --arch=sm_20)
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -149,7 +147,7 @@ InputIterator for_each(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \endcode
  *
  *  \see for_each
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -173,9 +171,9 @@ InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPoli
  *  \param f The function object to apply to the range <tt>[first, last)</tt>.
  *  \return last
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each to print the elements
@@ -194,7 +192,7 @@ InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPoli
  *      // note that using printf in a __device__ function requires
  *      // code compiled for a GPU with compute capability 2.0 or
  *      // higher (nvcc --arch=sm_20)
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -207,7 +205,7 @@ InputIterator for_each_n(const thrust::detail::execution_policy_base<DerivedPoli
  *  \endcode
  *
  *  \see for_each_n
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename InputIterator,
          typename UnaryFunction>
@@ -227,10 +225,10 @@ InputIterator for_each(InputIterator first,
  *  \param f The function object to apply to the range <tt>[first, first + n)</tt>.
  *  \return <tt>first + n</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
  *  \tparam Size is an integral type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction does not apply any non-constant operation through its argument.
  *
  *  The following code snippet demonstrates how to use \p for_each_n to print the elements
@@ -249,7 +247,7 @@ InputIterator for_each(InputIterator first,
  *      // note that using printf in a __device__ function requires
  *      // code compiled for a GPU with compute capability 2.0 or
  *      // higher (nvcc --arch=sm_20)
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -262,7 +260,7 @@ InputIterator for_each(InputIterator first,
  *  \endcode
  *
  *  \see for_each
- *  \see http://www.sgi.com/tech/stl/for_each.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/for_each
  */
 template<typename InputIterator,
          typename Size,
@@ -274,7 +272,7 @@ InputIterator for_each_n(InputIterator first,
 /*! \} // end modifying
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/for_each.inl>
 
diff --git a/thrust/functional.h b/thrust/functional.h
index c8caf4f7c..0608f4b3d 100644
--- a/thrust/functional.h
+++ b/thrust/functional.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -25,8 +25,7 @@
 #include <functional>
 #include <thrust/detail/functional/placeholder.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup function_objects Function Objects
  */
@@ -47,7 +46,7 @@ template<typename Operation> struct binary_traits;
  *  Unary Function must define nested \c typedefs. Those \c typedefs are
  *  provided by the base class \p unary_function.
  *
- *  The following code snippet demonstrates how to construct an 
+ *  The following code snippet demonstrates how to construct an
  *  Adaptable Unary Function using \p unary_function.
  *
  *  \code
@@ -58,11 +57,11 @@ template<typename Operation> struct binary_traits;
  *  };
  *  \endcode
  *
- *  \note unary_function is currently redundant with the C++ STL type
- *  \c std::unary_function. We reserve it here for potential additional
- *  functionality at a later date.
+ *  \note Because C++11 language support makes the functionality of
+ *        \c unary_function obsolete, its use is optional if C++11 language
+ *        features are enabled.
  *
- *  \see http://www.sgi.com/tech/stl/unary_function.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/unary_function
  *  \see binary_function
  */
 template<typename Argument,
@@ -87,7 +86,7 @@ struct unary_function
  *  Binary Function must define nested \c typedefs. Those \c typedefs are
  *  provided by the base class \p binary_function.
  *
- *  The following code snippet demonstrates how to construct an 
+ *  The following code snippet demonstrates how to construct an
  *  Adaptable Binary Function using \p binary_function.
  *
  *  \code
@@ -98,11 +97,11 @@ struct unary_function
  *  };
  *  \endcode
  *
- *  \note binary_function is currently redundant with the C++ STL type
- *  \c std::binary_function. We reserve it here for potential additional
- *  functionality at a later date.
+ *  \note Because C++11 language support makes the functionality of
+ *        \c binary_function obsolete, its use is optional if C++11 language
+ *        features are enabled.
  *
- *  \see http://www.sgi.com/tech/stl/binary_function.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/binary_function
  *  \see unary_function
  */
 template<typename Argument1,
@@ -139,11 +138,46 @@ struct binary_function
  *  \{
  */
 
+#define THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                   \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T>                                                      \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T&& x) const                                     \
+      noexcept(noexcept(impl)) THRUST_TRAILING_RETURN(decltype(impl))          \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(func, impl)                  \
+  template <>                                                                  \
+  struct func<void>                                                            \
+  {                                                                            \
+    using is_transparent = void;                                               \
+    __thrust_exec_check_disable__                                              \
+    template <typename T1, typename T2>                                        \
+    __host__ __device__                                                        \
+    constexpr auto operator()(T1&& t1, T2&& t2) const                          \
+      noexcept(noexcept(impl)) THRUST_TRAILING_RETURN(decltype(impl))          \
+    {                                                                          \
+      return impl;                                                             \
+    }                                                                          \
+  }
+
+#define THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(func, op)                 \
+  THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(                                   \
+    func, THRUST_FWD(t1) op THRUST_FWD(t2))
+
+
 /*! \p plus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>plus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x+y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x+y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>plus</tt> to sum two
@@ -165,14 +199,14 @@ struct binary_function
  *  thrust::fill(V2.begin(), V2.end(), 75);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
- *                     thrust::plus<float>());
+ *                    thrust::plus<float>());
  *  // V3 is now {76, 77, 78, ..., 1075}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/plus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/plus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct plus
 {
   /*! \typedef first_argument_type
@@ -192,14 +226,21 @@ struct plus
 
   /*! Function call operator. The return value is <tt>lhs + rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs + rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs + rhs;
+  }
 }; // end plus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(plus, +);
+
 /*! \p minus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>minus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x-y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x-y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>minus</tt> to subtract
@@ -221,14 +262,14 @@ struct plus
  *  thrust::fill(V2.begin(), V2.end(), 75);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
- *                     thrust::minus<float>());
- *  // V3 is now {-74, -75, -76, ..., -925}
+ *                    thrust::minus<float>());
+ *  // V3 is now {-74, -73, -72, ..., 925}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/minus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/minus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct minus
 {
   /*! \typedef first_argument_type
@@ -248,14 +289,21 @@ struct minus
 
   /*! Function call operator. The return value is <tt>lhs - rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs - rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs - rhs;
+  }
 }; // end minus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(minus, -);
+
 /*! \p multiplies is a function object. Specifically, it is an Adaptable Binary Function.
- *  If \c f is an object of class <tt>minus<T></tt>, and \c x and \c y are objects
+ *  If \c f is an object of class <tt>multiplies<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x*y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x*y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>multiplies</tt> to multiply
@@ -277,14 +325,14 @@ struct minus
  *  thrust::fill(V2.begin(), V2.end(), 75);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
- *                     thrust::multiplies<float>());
+ *                    thrust::multiplies<float>());
  *  // V3 is now {75, 150, 225, ..., 75000}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/multiplies.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/multiplies
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct multiplies
 {
   /*! \typedef first_argument_type
@@ -304,14 +352,21 @@ struct multiplies
 
   /*! Function call operator. The return value is <tt>lhs * rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs * rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs * rhs;
+  }
 }; // end multiplies
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(multiplies, *);
+
 /*! \p divides is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>divides<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x/y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x/y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>divides</tt> to divide
@@ -333,14 +388,14 @@ struct multiplies
  *  thrust::fill(V2.begin(), V2.end(), 75);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
- *                     thrust::divides<float>());
+ *                    thrust::divides<float>());
  *  // V3 is now {1/75, 2/75, 3/75, ..., 1000/75}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/divides.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/divides
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct divides
 {
   /*! \typedef first_argument_type
@@ -360,14 +415,21 @@ struct divides
 
   /*! Function call operator. The return value is <tt>lhs / rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs / rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs / rhs;
+  }
 }; // end divides
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(divides, /);
+
 /*! \p modulus is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>modulus<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x \% y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x \% y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>modulus</tt> to take
@@ -389,14 +451,14 @@ struct divides
  *  thrust::fill(V2.begin(), V2.end(), 75);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(), V3.begin(),
- *                     thrust::modulus<int>());
+ *                    thrust::modulus<int>());
  *  // V3 is now {1%75, 2%75, 3%75, ..., 1000%75}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/modulus.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/modulus
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct modulus
 {
   /*! \typedef first_argument_type
@@ -416,18 +478,25 @@ struct modulus
 
   /*! Function call operator. The return value is <tt>lhs % rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs % rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs % rhs;
+  }
 }; // end modulus
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(modulus, %);
+
 /*! \p negate is a function object. Specifically, it is an Adaptable Unary Function.
  *  If \c f is an object of class <tt>negate<T></tt>, and \c x is an object
  *  of class \c T, then <tt>f(x)</tt> returns <tt>-x</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p T, then <tt>-x</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>negate</tt> to negate
- *  the element of a device_vector of \c floats.
+ *  the elements of a device_vector of \c floats.
  *
  *  \code
  *  #include <thrust/device_vector.h>
@@ -442,14 +511,14 @@ struct modulus
  *  thrust::sequence(V1.begin(), V1.end(), 1);
  *
  *  thrust::transform(V1.begin(), V1.end(), V2.begin(),
- *                     thrust::negate<float>());
+ *                    thrust::negate<float>());
  *  // V2 is now {-1, -2, -3, ..., -1000}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/negate
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct negate
 {
   /*! \typedef argument_type
@@ -464,9 +533,70 @@ struct negate
 
   /*! Function call operator. The return value is <tt>-x</tt>.
    */
-  __host__ __device__ T operator()(const T &x) const {return -x;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return -x;
+  }
 }; // end negate
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(negate, -THRUST_FWD(x));
+
+/*! \p square is a function object. Specifically, it is an Adaptable Unary Function.
+ *  If \c f is an object of class <tt>square<T></tt>, and \c x is an object
+ *  of class \c T, then <tt>f(x)</tt> returns <tt>x*x</tt>.
+ *
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
+ *          and if \c x is an object of type \p T, then <tt>x*x</tt> must be defined and must have a return type that is convertible to \c T.
+ *
+ *  The following code snippet demonstrates how to use <tt>square</tt> to square
+ *  the elements of a device_vector of \c floats.
+ *
+ *  \code
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/functional.h>
+ *  #include <thrust/sequence.h>
+ *  #include <thrust/transform.h>
+ *  ...
+ *  const int N = 1000;
+ *  thrust::device_vector<float> V1(N);
+ *  thrust::device_vector<float> V2(N);
+ *
+ *  thrust::sequence(V1.begin(), V1.end(), 1);
+ *
+ *  thrust::transform(V1.begin(), V1.end(), V2.begin(),
+ *                    thrust::square<float>());
+ *  // V2 is now {1, 4, 9, ..., 1000000}
+ *  \endcode
+ *
+ *  \see unary_function
+ */
+template<typename T = void>
+struct square
+{
+  /*! \typedef argument_type
+   *  \brief The type of the function object's argument.
+   */
+  typedef T argument_type;
+
+  /*! \typedef result_type
+   *  \brief The type of the function object's result;
+   */
+  typedef T result_type;
+
+  /*! Function call operator. The return value is <tt>x*x</tt>.
+   */
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &x) const
+  {
+    return x*x;
+  }
+}; // end square
+
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(square, x*x);
+
 /*! \}
  */
 
@@ -481,12 +611,12 @@ struct negate
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x == y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/equal_to.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/equal_to
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct equal_to
 {
   /*! \typedef first_argument_type
@@ -506,21 +636,28 @@ struct equal_to
 
   /*! Function call operator. The return value is <tt>lhs == rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs == rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs == rhs;
+  }
 }; // end equal_to
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(equal_to, ==);
+
 /*! \p not_equal_to is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>not_equal_to<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x != y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/not_equal_to.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/not_equal_to
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct not_equal_to
 {
   /*! \typedef first_argument_type
@@ -540,21 +677,28 @@ struct not_equal_to
 
   /*! Function call operator. The return value is <tt>lhs != rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs != rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs != rhs;
+  }
 }; // end not_equal_to
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(not_equal_to, !=);
+
 /*! \p greater is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>greater<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x > y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/greater.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/greater
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct greater
 {
   /*! \typedef first_argument_type
@@ -574,21 +718,28 @@ struct greater
 
   /*! Function call operator. The return value is <tt>lhs > rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs > rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs > rhs;
+  }
 }; // end greater
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater, >);
+
 /*! \p less is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>less<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x < y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/less.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/less
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct less
 {
   /*! \typedef first_argument_type
@@ -608,21 +759,28 @@ struct less
 
   /*! Function call operator. The return value is <tt>lhs < rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs < rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs;
+  }
 }; // end less
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less, <);
+
 /*! \p greater_equal is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>greater_equal<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x >= y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/greater_equal.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/greater_equal
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct greater_equal
 {
   /*! \typedef first_argument_type
@@ -642,21 +800,28 @@ struct greater_equal
 
   /*! Function call operator. The return value is <tt>lhs >= rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs >= rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs >= rhs;
+  }
 }; // end greater_equal
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(greater_equal, >=);
+
 /*! \p less_equal is a function object. Specifically, it is an Adaptable Binary
  *  Predicate, which means it is a function object that tests the truth or falsehood
  *  of some condition. If \c f is an object of class <tt>less_equal<T></tt> and \c x
  *  and \c y are objects of class \c T, then <tt>f(x,y)</tt> returns \c true if
  *  <tt>x <= y</tt> and \c false otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
- *  \see http://www.sgi.com/tech/stl/less_equal.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/less_equal
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct less_equal
 {
   /*! \typedef first_argument_type
@@ -676,9 +841,16 @@ struct less_equal
 
   /*! Function call operator. The return value is <tt>lhs <= rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs <= rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs <= rhs;
+  }
 }; // end less_equal
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(less_equal, <=);
+
 /*! \}
  */
 
@@ -696,10 +868,10 @@ struct less_equal
  *
  *  \tparam T must be convertible to \c bool.
  *
- *  \see http://www.sgi.com/tech/stl/logical_and.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_and
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_and
 {
   /*! \typedef first_argument_type
@@ -719,9 +891,16 @@ struct logical_and
 
   /*! Function call operator. The return value is <tt>lhs && rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs && rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs && rhs;
+  }
 }; // end logical_and
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_and, &&);
+
 /*! \p logical_or is a function object. Specifically, it is an Adaptable Binary Predicate,
  *  which means it is a function object that tests the truth or falsehood of some condition.
  *  If \c f is an object of class <tt>logical_or<T></tt> and \c x and \c y are objects of
@@ -730,10 +909,10 @@ struct logical_and
  *
  *  \tparam T must be convertible to \c bool.
  *
- *  \see http://www.sgi.com/tech/stl/logical_or.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_or
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_or
 {
   /*! \typedef first_argument_type
@@ -753,9 +932,16 @@ struct logical_or
 
   /*! Function call operator. The return value is <tt>lhs || rhs</tt>.
    */
-  __host__ __device__ bool operator()(const T &lhs, const T &rhs) const {return lhs || rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs || rhs;
+  }
 }; // end logical_or
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(logical_or, ||);
+
 /*! \p logical_not is a function object. Specifically, it is an Adaptable Predicate,
  *  which means it is a function object that tests the truth or falsehood of some condition.
  *  If \c f is an object of class <tt>logical_not<T></tt> and \c x is an object of
@@ -778,10 +964,10 @@ struct logical_or
  *  // The elements of V are now the logical complement of what they were prior
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/logical_not.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/logical_not
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct logical_not
 {
   /*! \typedef first_argument_type
@@ -801,9 +987,16 @@ struct logical_not
 
   /*! Function call operator. The return value is <tt>!x</tt>.
    */
-  __host__ __device__ bool operator()(const T &x) const {return !x;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool operator()(const T &x) const
+  {
+    return !x;
+  }
 }; // end logical_not
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(logical_not, !THRUST_FWD(x));
+
 /*! \}
  */
 
@@ -816,7 +1009,7 @@ struct logical_not
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x&y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x&y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_and</tt> to take
@@ -844,7 +1037,7 @@ struct logical_not
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_and
 {
   /*! \typedef first_argument_type
@@ -864,14 +1057,21 @@ struct bit_and
 
   /*! Function call operator. The return value is <tt>lhs & rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs & rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs & rhs;
+  }
 }; // end bit_and
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_and, &);
+
 /*! \p bit_or is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x|y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x|y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_or</tt> to take
@@ -899,7 +1099,7 @@ struct bit_and
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_or
 {
   /*! \typedef first_argument_type
@@ -919,14 +1119,21 @@ struct bit_or
 
   /*! Function call operator. The return value is <tt>lhs | rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs | rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs | rhs;
+  }
 }; // end bit_or
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_or, |);
+
 /*! \p bit_xor is a function object. Specifically, it is an Adaptable Binary Function.
  *  If \c f is an object of class <tt>bit_and<T></tt>, and \c x and \c y are objects
  *  of class \c T, then <tt>f(x,y)</tt> returns <tt>x^y</tt>.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x and \c y are objects of type \p T, then <tt>x^y</tt> must be defined and must have a return type that is convertible to \c T.
  *
  *  The following code snippet demonstrates how to use <tt>bit_xor</tt> to take
@@ -954,7 +1161,7 @@ struct bit_or
  *
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct bit_xor
 {
   /*! \typedef first_argument_type
@@ -974,9 +1181,16 @@ struct bit_xor
 
   /*! Function call operator. The return value is <tt>lhs ^ rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs ^ rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs ^ rhs;
+  }
 }; // end bit_xor
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP(bit_xor, ^);
+
 /*! \}
  */
 
@@ -1002,10 +1216,10 @@ struct bit_xor
  *  assert(x == id(x));
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/identity.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/identity
  *  \see unary_function
  */
-template<typename T>
+template<typename T = void>
 struct identity
 {
   /*! \typedef argument_type
@@ -1020,15 +1234,22 @@ struct identity
 
   /*! Function call operator. The return value is <tt>x</tt>.
    */
-  __host__ __device__ const T &operator()(const T &x) const {return x;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator()(const T &x) const
+  {
+    return x;
+  }
 }; // end identity
 
+THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION(identity, THRUST_FWD(x));
+
 /*! \p maximum is a function object that takes two arguments and returns the greater
  *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
  *  object of class <tt>maximum<T></tt> and \c x and \c y are objects of class \c T
  *  <tt>f(x,y)</tt> returns \c x if <tt>x > y</tt> and \c y, otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates that \p maximum returns its
  *  greater argument.
@@ -1047,7 +1268,7 @@ struct identity
  *  \see min
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct maximum
 {
   /*! \typedef first_argument_type
@@ -1067,15 +1288,24 @@ struct maximum
 
   /*! Function call operator. The return value is <tt>rhs < lhs ? lhs : rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? rhs : lhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? rhs : lhs;
+  }
 }; // end maximum
 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(maximum,
+                                          t1 < t2 ? THRUST_FWD(t2)
+                                                  : THRUST_FWD(t1));
+
 /*! \p minimum is a function object that takes two arguments and returns the lesser
  *  of the two. Specifically, it is an Adaptable Binary Function. If \c f is an
  *  object of class <tt>minimum<T></tt> and \c x and \c y are objects of class \c T
  *  <tt>f(x,y)</tt> returns \c x if <tt>x < y</tt> and \c y, otherwise.
  *
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates that \p minimum returns its
  *  lesser argument.
@@ -1094,7 +1324,7 @@ struct maximum
  *  \see max
  *  \see binary_function
  */
-template<typename T>
+template<typename T = void>
 struct minimum
 {
   /*! \typedef first_argument_type
@@ -1114,10 +1344,19 @@ struct minimum
 
   /*! Function call operator. The return value is <tt>lhs < rhs ? lhs : rhs</tt>.
    */
-  __host__ __device__ T operator()(const T &lhs, const T &rhs) const {return lhs < rhs ? lhs : rhs;}
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr T operator()(const T &lhs, const T &rhs) const
+  {
+    return lhs < rhs ? lhs : rhs;
+  }
 }; // end minimum
 
-/*! \p project1st is a function object that takes two arguments and returns 
+THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION(minimum,
+                                          t1 < t2 ? THRUST_FWD(t1)
+                                                  : THRUST_FWD(t2));
+
+/*! \p project1st is a function object that takes two arguments and returns
  *  its first argument; the second argument is unused. It is essentially a
  *  generalization of identity to the case of a Binary Function.
  *
@@ -1135,7 +1374,7 @@ struct minimum
  *  \see project2nd
  *  \see binary_function
  */
-template<typename T1, typename T2>
+template<typename T1 = void, typename T2 = void>
 struct project1st
 {
   /*! \typedef first_argument_type
@@ -1155,10 +1394,29 @@ struct project1st
 
   /*! Function call operator. The return value is <tt>lhs</tt>.
    */
-  __host__ __device__ const T1 &operator()(const T1 &lhs, const T2 &rhs) const {return lhs;}
+  __host__ __device__
+  constexpr const T1 &operator()(const T1 &lhs, const T2 & /*rhs*/) const
+  {
+    return lhs;
+  }
 }; // end project1st
 
-/*! \p project2nd is a function object that takes two arguments and returns 
+template <>
+struct project1st<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&& t1, T2&&) const
+    noexcept(noexcept(THRUST_FWD(t1)))
+    THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t1)))
+  {
+    return THRUST_FWD(t1);
+  }
+};
+
+/*! \p project2nd is a function object that takes two arguments and returns
  *  its second argument; the first argument is unused. It is essentially a
  *  generalization of identity to the case of a Binary Function.
  *
@@ -1176,7 +1434,7 @@ struct project1st
  *  \see project1st
  *  \see binary_function
  */
-template<typename T1, typename T2>
+template<typename T1 = void, typename T2 = void>
 struct project2nd
 {
   /*! \typedef first_argument_type
@@ -1196,13 +1454,31 @@ struct project2nd
 
   /*! Function call operator. The return value is <tt>rhs</tt>.
    */
-  __host__ __device__ const T2 &operator()(const T1 &lhs, const T2 &rhs) const {return rhs;}
+  __host__ __device__
+  constexpr const T2 &operator()(const T1 &/*lhs*/, const T2 &rhs) const
+  {
+    return rhs;
+  }
 }; // end project2nd
 
+template <>
+struct project2nd<void, void>
+{
+  using is_transparent = void;
+  __thrust_exec_check_disable__
+  template <typename T1, typename T2>
+  __host__ __device__
+  constexpr auto operator()(T1&&, T2&& t2) const
+  noexcept(noexcept(THRUST_FWD(t2)))
+  THRUST_TRAILING_RETURN(decltype(THRUST_FWD(t2)))
+  {
+    return THRUST_FWD(t2);
+  }
+};
+
 /*! \}
  */
 
-
 // odds and ends
 
 /*! \addtogroup function_object_adaptors
@@ -1217,11 +1493,11 @@ struct project2nd
  *  There is rarely any reason to construct a <tt>unary_negate</tt> directly;
  *  it is almost always easier to use the helper function not1.
  *
- *  \see http://www.sgi.com/tech/stl/unary_negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/unary_negate
  *  \see not1
  */
 template<typename Predicate>
-struct unary_negate 
+struct unary_negate
     : public thrust::unary_function<typename Predicate::argument_type, bool>
 {
   /*! Constructor takes a \p Predicate object to negate.
@@ -1232,6 +1508,7 @@ struct unary_negate
 
   /*! Function call operator. The return value is <tt>!pred(x)</tt>.
    */
+  __thrust_exec_check_disable__
   __host__ __device__
   bool operator()(const typename Predicate::argument_type& x) { return !pred(x); }
 
@@ -1253,7 +1530,7 @@ struct unary_negate
  *  \return A new object, <tt>npred</tt> such that <tt>npred(x)</tt> always returns
  *          the same value as <tt>!pred(x)</tt>.
  *
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptablePredicate.html">Adaptable Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_negate">Adaptable Predicate</a>.
  *
  *  \see unary_negate
  *  \see not2
@@ -1262,7 +1539,7 @@ template<typename Predicate>
   __host__ __device__
   unary_negate<Predicate> not1(const Predicate &pred);
 
-/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary 
+/*! \p binary_negate is a function object adaptor: it is an Adaptable Binary
  *  Predicate that represents the logical negation of some other Adaptable
  *  Binary Predicate. That is: if \c f is an object of class <tt>binary_negate<AdaptablePredicate></tt>,
  *  then there exists an object \c pred of class \c AdaptableBinaryPredicate
@@ -1270,7 +1547,7 @@ template<typename Predicate>
  *  There is rarely any reason to construct a <tt>binary_negate</tt> directly;
  *  it is almost always easier to use the helper function not2.
  *
- *  \see http://www.sgi.com/tech/stl/binary_negate.html
+ *  \see https://en.cppreference.com/w/cpp/utility/functional/binary_negate
  */
 template<typename Predicate>
 struct binary_negate
@@ -1286,10 +1563,11 @@ struct binary_negate
 
   /*! Function call operator. The return value is <tt>!pred(x,y)</tt>.
    */
+  __thrust_exec_check_disable__
   __host__ __device__
   bool operator()(const typename Predicate::first_argument_type& x, const typename Predicate::second_argument_type& y)
-  { 
-      return !pred(x,y); 
+  {
+      return !pred(x,y);
   }
 
   /*! \cond
@@ -1310,7 +1588,7 @@ struct binary_negate
  *  \return A new object, <tt>npred</tt> such that <tt>npred(x,y)</tt> always returns
  *          the same value as <tt>!pred(x,y)</tt>.
  *
- *  \tparam Binary Predicate is a model of <a href="http://www.sgi.com/tech/stl/AdaptableBinaryPredicate.html">Adaptable Binary Predicate</a>.
+ *  \tparam Binary Predicate is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/AdaptableBinaryPredicate">Adaptable Binary Predicate</a>.
  *
  *  \see binary_negate
  *  \see not1
@@ -1329,7 +1607,7 @@ template<typename BinaryPredicate>
  */
 
 
-/*! \namespace placeholders
+/*! \namespace thrust::placeholders
  *  \brief Facilities for constructing simple functions inline.
  *
  *  Objects in the \p thrust::placeholders namespace may be used to create simple arithmetic functions inline
@@ -1353,7 +1631,7 @@ template<typename BinaryPredicate>
  *    x[1] = 2;
  *    x[2] = 3;
  *    x[3] = 4;
- *    
+ *
  *    y[0] = 1;
  *    y[1] = 1;
  *    y[2] = 1;
@@ -1377,92 +1655,52 @@ namespace placeholders
 
 /*! \p thrust::placeholders::_1 is the placeholder for the first function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<0>::type _1;
-#else
-static const thrust::detail::functional::placeholder<0>::type _1;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<0>::type _1;
 
 
 /*! \p thrust::placeholders::_2 is the placeholder for the second function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<1>::type _2;
-#else
-static const thrust::detail::functional::placeholder<1>::type _2;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<1>::type _2;
 
 
 /*! \p thrust::placeholders::_3 is the placeholder for the third function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<2>::type _3;
-#else
-static const thrust::detail::functional::placeholder<2>::type _3;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<2>::type _3;
 
 
 /*! \p thrust::placeholders::_4 is the placeholder for the fourth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<3>::type _4;
-#else
-static const thrust::detail::functional::placeholder<3>::type _4;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<3>::type _4;
 
 
 /*! \p thrust::placeholders::_5 is the placeholder for the fifth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<4>::type _5;
-#else
-static const thrust::detail::functional::placeholder<4>::type _5;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<4>::type _5;
 
 
 /*! \p thrust::placeholders::_6 is the placeholder for the sixth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<5>::type _6;
-#else
-static const thrust::detail::functional::placeholder<5>::type _6;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<5>::type _6;
 
 
 /*! \p thrust::placeholders::_7 is the placeholder for the seventh function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<6>::type _7;
-#else
-static const thrust::detail::functional::placeholder<6>::type _7;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<6>::type _7;
 
 
 /*! \p thrust::placeholders::_8 is the placeholder for the eighth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<7>::type _8;
-#else
-static const thrust::detail::functional::placeholder<7>::type _8;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<7>::type _8;
 
 
 /*! \p thrust::placeholders::_9 is the placeholder for the ninth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<8>::type _9;
-#else
-static const thrust::detail::functional::placeholder<8>::type _9;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<8>::type _9;
 
 
 /*! \p thrust::placeholders::_10 is the placeholder for the tenth function parameter.
  */
-#ifdef __CUDA_ARCH__
-static const __device__ thrust::detail::functional::placeholder<9>::type _10;
-#else
-static const thrust::detail::functional::placeholder<9>::type _10;
-#endif
+THRUST_INLINE_CONSTANT thrust::detail::functional::placeholder<9>::type _10;
 
 
 } // end placeholders
@@ -1471,9 +1709,11 @@ static const thrust::detail::functional::placeholder<9>::type _10;
 /*! \} // placeholder_objects
  */
 
+#undef THRUST_UNARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION
+#undef THRUST_BINARY_FUNCTOR_VOID_SPECIALIZATION_OP
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/functional.inl>
 #include <thrust/detail/functional/operators.h>
-
diff --git a/thrust/future.h b/thrust/future.h
new file mode 100644
index 000000000..d8fb7544b
--- /dev/null
+++ b/thrust/future.h
@@ -0,0 +1,176 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/future.h
+ *  \brief `thrust::future`, an asynchronous value type.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/execution_policy.h>
+#include <thrust/detail/static_assert.h>
+
+#include <utility>
+
+/*
+// #include the host system's pointer.h header.
+#define __THRUST_HOST_SYSTEM_POINTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/pointer.h>
+  #include __THRUST_HOST_SYSTEM_POINTER_HEADER
+#undef __THRUST_HOST_SYSTEM_POINTER_HEADER
+*/
+
+// #include the device system's pointer.h header.
+#define __THRUST_DEVICE_SYSTEM_POINTER_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/pointer.h>
+  #include __THRUST_DEVICE_SYSTEM_POINTER_HEADER
+#undef __THRUST_DEVICE_SYSTEM_POINTER_HEADER
+
+/*
+// #include the host system's future.h header.
+#define __THRUST_HOST_SYSTEM_FUTURE_HEADER <__THRUST_HOST_SYSTEM_ROOT/future.h>
+  #include __THRUST_HOST_SYSTEM_FUTURE_HEADER
+#undef __THRUST_HOST_SYSTEM_FUTURE_HEADER
+*/
+
+// #include the device system's future.h header.
+#define __THRUST_DEVICE_SYSTEM_FUTURE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/future.h>
+  #include __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_FUTURE_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+///////////////////////////////////////////////////////////////////////////////
+
+// `select_unique_(future|event)_type` is a hook for choosing the
+// `unique_eager_event`/`unique_eager_future` type for a system. `decltype` is
+// used to determine the return type of an ADL call to
+// `select_unique_eager_(future|event)_type(system)`; that return type should
+// be the correct event/future type for `system`. Overloads should only be
+// declared, not defined.
+
+namespace unimplemented
+{
+
+struct no_unique_eager_event_type_found {};
+
+inline __host__ 
+no_unique_eager_event_type_found
+unique_eager_event_type(...) noexcept;
+
+struct no_unique_eager_future_type_found {};
+
+template <typename T>
+__host__ 
+no_unique_eager_future_type_found
+unique_eager_future_type(...) noexcept;
+
+} // namespace unimplemented
+
+namespace unique_eager_event_type_detail
+{
+
+using unimplemented::unique_eager_event_type;
+
+template <typename System>
+using select = decltype(
+  unique_eager_event_type(std::declval<System>())
+);
+
+} // namespace unique_eager_event_type_detail
+
+namespace unique_eager_future_type_detail
+{
+
+using unimplemented::unique_eager_future_type;
+
+template <typename System, typename T>
+using select = decltype(
+  unique_eager_future_type<T>(std::declval<System>())
+);
+
+} // namespace unique_eager_future_type_detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename System>
+using unique_eager_event = unique_eager_event_type_detail::select<System>;
+
+template <typename System>
+using event = unique_eager_event<System>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename System, typename T>
+using unique_eager_future = unique_eager_future_type_detail::select<System, T>;
+
+template <typename System, typename T>
+using future = unique_eager_future<System, T>;
+
+/*
+///////////////////////////////////////////////////////////////////////////////
+
+using host_unique_eager_event = unique_eager_event_type_detail::select<
+  thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag
+>;
+using host_event = host_unique_eager_event;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+using host_unique_eager_future = unique_eager_future_type_detail::select<
+  thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag, T
+>;
+template <typename T>
+using host_future = host_unique_eager_future<T>;
+*/
+
+///////////////////////////////////////////////////////////////////////////////
+
+using device_unique_eager_event = unique_eager_event_type_detail::select<
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag
+>;
+
+using device_event = device_unique_eager_event;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+using device_unique_eager_future = unique_eager_future_type_detail::select<
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag, T
+>;
+
+template <typename T>
+using device_future = device_unique_eager_future<T>;
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct new_stream_t final {};
+
+THRUST_INLINE_CONSTANT new_stream_t new_stream{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::when_all;
+
+///////////////////////////////////////////////////////////////////////////////
+
+THRUST_NAMESPACE_END
+
+#endif
diff --git a/thrust/gather.h b/thrust/gather.h
index 276650a6c..41acc22a3 100644
--- a/thrust/gather.h
+++ b/thrust/gather.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup gathering
  *  \ingroup copying
@@ -48,11 +46,12 @@ namespace thrust
  *  \param result Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather is the inverse of thrust::scatter.
  *
@@ -103,11 +102,12 @@ __host__ __device__
  *  \param input_first Beginning of the source range.
  *  \param result Beginning of the destination range.
  *
- *  \tparam InputIterator must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather is the inverse of thrust::scatter.
  *
@@ -159,13 +159,14 @@ template<typename InputIterator,
  *  \param result Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -225,13 +226,14 @@ __host__ __device__
  *  \param input_first Beginning of the source range.
  *  \param result Beginning of the destination range.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -291,14 +293,15 @@ template<typename InputIterator1,
  *  \param pred Predicate to apply to the stencil values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -371,14 +374,15 @@ __host__ __device__
  *  \param result Beginning of the destination range.
  *  \param pred Predicate to apply to the stencil values.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator must be a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a> and \c RandomAccessIterator's \c value_type must be convertible to \c OutputIterator's \c value_type.
+ *  \tparam OutputIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[map_first, map_last)</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (map_last - map_first))</tt> shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
+ *  \pre The input data shall not overlap the range <tt>[result, result + (map_last - map_first))</tt>.
  *
  *  \remark \p gather_if is the inverse of \p scatter_if.
  *
@@ -435,7 +439,7 @@ template<typename InputIterator1,
 /*! \} // gathering
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/gather.inl>
 
diff --git a/thrust/generate.h b/thrust/generate.h
index a651dd0dc..d47295344 100644
--- a/thrust/generate.h
+++ b/thrust/generate.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -45,9 +43,9 @@ namespace thrust
  *             elements in the range <tt>[first,last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -67,7 +65,7 @@ namespace thrust
  *  \endcode
  *
  *  \see generate_n
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename DerivedPolicy,
          typename ForwardIterator,
@@ -87,9 +85,9 @@ __host__ __device__
  *  \param gen A function argument, taking no parameters, used to generate values to assign to
  *             elements in the range <tt>[first,last)</tt>.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -109,7 +107,7 @@ __host__ __device__
  *  \endcode
  *
  *  \see generate_n
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename ForwardIterator,
          typename Generator>
@@ -130,9 +128,9 @@ template<typename ForwardIterator,
  *             elements in the range <tt>[first,first + n)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam Size is an integral type (either signed or unsigned).
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -152,7 +150,7 @@ template<typename ForwardIterator,
  *  \endcode
  *
  *  \see generate
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename DerivedPolicy,
          typename OutputIterator,
@@ -173,9 +171,9 @@ __host__ __device__
  *  \param gen A function argument, taking no parameters, used to generate values to assign to
  *             elements in the range <tt>[first,first + n)</tt>.
  *
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *  \tparam Size is an integral type (either signed or unsigned).
- *  \tparam Generator is a model of <a href="http://www.sgi.com/tech/stl/Generator.html">Generator</a>,
+ *  \tparam Generator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional">Generator</a>,
  *          and \p Generator's \c result_type is convertible to a type in \p OutputIterator's set of \c value_types.
  *
  *  The following code snippet demonstrates how to fill a \c host_vector with random numbers,
@@ -194,7 +192,7 @@ __host__ __device__
  *  \endcode
  *
  *  \see generate
- *  \see http://www.sgi.com/tech/stl/generate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/generate
  */
 template<typename OutputIterator,
          typename Size,
@@ -207,7 +205,7 @@ template<typename OutputIterator,
 /*! \} // end transformations
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/generate.inl>
 
diff --git a/thrust/host_vector.h b/thrust/host_vector.h
index 870b0a7a5..bb925ea9c 100644
--- a/thrust/host_vector.h
+++ b/thrust/host_vector.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -16,21 +16,21 @@
 
 
 /*! \file host_vector.h
- *  \brief A dynamically-sizable array of elements which reside in the "host" memory space
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to hosts.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <memory>
+#include <thrust/detail/memory_wrapper.h>
 #include <thrust/detail/vector_base.h>
-#include <vector>
 
-namespace thrust
-{
+#include <initializer_list>
+#include <vector>
+#include <utility>
 
-// forward declaration of device_vector
-template<typename T, typename Alloc> class device_vector;
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup container_classes Container Classes
  *  \addtogroup host_containers Host Containers
@@ -42,11 +42,12 @@ template<typename T, typename Alloc> class device_vector;
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p host_vector may vary dynamically; memory management is
- *  automatic. The memory associated with a \p host_vector resides in the memory
- *  space of the host associated with a parallel device.
+ *  automatic. The memory associated with a \p host_vector resides in memory
+ *  accessible to hosts.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see device_vector
+ *  \see universal_vector
  */
 template<typename T, typename Alloc = std::allocator<T> >
   class host_vector
@@ -69,6 +70,13 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector(void)
       :Parent() {}
 
+    /*! This constructor creates an empty \p host_vector.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    host_vector(const Alloc &alloc)
+      :Parent(alloc) {}
+
     /*! The destructor erases the elements.
      */
     //  Define an empty destructor to explicitly specify
@@ -84,6 +92,15 @@ template<typename T, typename Alloc = std::allocator<T> >
     explicit host_vector(size_type n)
       :Parent(n) {}
 
+    /*! This constructor creates a \p host_vector with the given
+     *  size.
+     *  \param n The number of elements to initially create.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    explicit host_vector(size_type n, const Alloc &alloc)
+      :Parent(n,alloc) {}
+
     /*! This constructor creates a \p host_vector with copies
      *  of an exemplar element.
      *  \param n The number of elements to initially create.
@@ -93,6 +110,16 @@ template<typename T, typename Alloc = std::allocator<T> >
     explicit host_vector(size_type n, const value_type &value)
       :Parent(n,value) {}
 
+    /*! This constructor creates a \p host_vector with copies
+     *  of an exemplar element.
+     *  \param n The number of elements to initially create.
+     *  \param value An element to copy.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    __host__
+    explicit host_vector(size_type n, const value_type &value, const Alloc &alloc)
+      :Parent(n,value,alloc) {}
+
     /*! Copy constructor copies from an exemplar \p host_vector.
      *  \param v The \p host_vector to copy.
      */
@@ -100,12 +127,46 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector(const host_vector &v)
       :Parent(v) {}
 
-    /*! Assign operator copies from an exemplar \p host_vector.
+    /*! Copy constructor copies from an exemplar \p host_vector.
      *  \param v The \p host_vector to copy.
+     *  \param alloc The allocator to use by this host_vector.
      */
     __host__
-    host_vector &operator=(const host_vector &v)
-    { Parent::operator=(v); return *this; }
+    host_vector(const host_vector &v, const Alloc &alloc)
+      :Parent(v,alloc) {}
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move constructor moves from another host_vector.
+     *  \param v The host_vector to move.
+     */
+     __host__
+    host_vector(host_vector &&v)
+      :Parent(std::move(v)) {}
+
+    /*! Move constructor moves from another host_vector.
+     *  \param v The host_vector to move.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+     __host__
+    host_vector(host_vector &&v, const Alloc &alloc)
+      :Parent(std::move(v),alloc) {}
+  #endif
+
+  /*! Assign operator copies from an exemplar \p host_vector.
+   *  \param v The \p host_vector to copy.
+   */
+  __host__
+  host_vector &operator=(const host_vector &v)
+  { Parent::operator=(v); return *this; }
+
+  #if THRUST_CPP_DIALECT >= 2011
+    /*! Move assign operator moves from another host_vector.
+     *  \param v The host_vector to move.
+     */
+     __host__
+     host_vector &operator=(host_vector &&v)
+     { Parent::operator=(std::move(v)); return *this; }
+  #endif
 
     /*! Copy constructor copies from an exemplar \p host_vector with different type.
      *  \param v The \p host_vector to copy.
@@ -139,20 +200,43 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector &operator=(const std::vector<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this;}
 
-    /*! Copy constructor copies from an exemplar \p device_vector with possibly different type.
-     *  \param v The \p device_vector to copy.
+    /*! Copy construct from a \p vector_base whose element type is convertible
+     *  to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
     __host__
-    host_vector(const device_vector<OtherT,OtherAlloc> &v);
+    host_vector(const detail::vector_base<OtherT,OtherAlloc> &v)
+      :Parent(v) {}
 
-    /*! Assign operator copies from an exemplar \p device_vector.
-     *  \param v The \p device_vector to copy.
+    /*! Assign a \p vector_base whose element type is convertible to \c T.
+     *
+     *  \param v The \p vector_base to copy.
      */
     template<typename OtherT, typename OtherAlloc>
     __host__
-    host_vector &operator=(const device_vector<OtherT,OtherAlloc> &v)
+    host_vector &operator=(const detail::vector_base<OtherT,OtherAlloc> &v)
     { Parent::operator=(v); return *this; }
+    
+    /*! This constructor builds a \p host_vector from an intializer_list.
+     *  \param il The intializer_list.
+     */
+    host_vector(std::initializer_list<T> il)
+      :Parent(il) {}
+      
+    /*! This constructor builds a \p host_vector from an intializer_list.
+     *  \param il The intializer_list.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    host_vector(std::initializer_list<T> il, const Alloc &alloc)
+      :Parent(il, alloc) {}
+      
+    /*! Assign an \p intializer_list with a matching element type
+     *  \param il The intializer_list.
+     */
+    host_vector &operator=(std::initializer_list<T> il)
+    { Parent::operator=(il); return *this; }
 
     /*! This constructor builds a \p host_vector from a range.
      *  \param first The beginning of the range.
@@ -163,6 +247,16 @@ template<typename T, typename Alloc = std::allocator<T> >
     host_vector(InputIterator first, InputIterator last)
       :Parent(first, last) {}
 
+    /*! This constructor builds a \p host_vector from a range.
+     *  \param first The beginning of the range.
+     *  \param last The end of the range.
+     *  \param alloc The allocator to use by this host_vector.
+     */
+    template<typename InputIterator>
+    __host__
+    host_vector(InputIterator first, InputIterator last, const Alloc &alloc)
+      :Parent(first, last, alloc) {}
+
 // declare these members for the purpose of Doxygenating them
 // they actually exist in a derived-from class
 #if 0
@@ -353,7 +447,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      */
     void pop_back(void);
 
-    /*! This method swaps the contents of this vector_base with another vector.
+    /*! This method swaps the contents of this host_vector with another vector.
      *  \param v The vector with which to swap.
      */
     void swap(host_vector &v);
@@ -379,7 +473,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param x The exemplar element to copy & insert.
      *  \return An iterator pointing to the newly inserted element.
      */
-    iterator insert(iterator position, const T &x); 
+    iterator insert(iterator position, const T &x);
 
     /*! This method inserts a copy of an exemplar value to a range at the
      *  specified position in this vector.
@@ -395,8 +489,8 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html>Input Iterator</a>,
-     *                        and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator>Input Iterator</a>,
+     *                        and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
      */
     template<typename InputIterator>
     void insert(iterator position, InputIterator first, InputIterator last);
@@ -412,7 +506,7 @@ template<typename T, typename Alloc = std::allocator<T> >
      *  \param first The beginning of the range to copy.
      *  \param last  The end of the range to copy.
      *
-     *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator">Input Iterator</a>.
+     *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/named_req/InputIterator">Input Iterator</a>.
      */
     template<typename InputIterator>
     void assign(InputIterator first, InputIterator last);
@@ -422,12 +516,19 @@ template<typename T, typename Alloc = std::allocator<T> >
      */
     allocator_type get_allocator(void) const;
 #endif // end doxygen-only members
-}; // end host_vector
+};
 
-/*! \}
+/*! Exchanges the values of two vectors.
+ *  \p x The first \p host_vector of interest.
+ *  \p y The second \p host_vector of interest.
  */
+template<typename T, typename Alloc>
+  void swap(host_vector<T,Alloc> &a, host_vector<T,Alloc> &b)
+{
+  a.swap(b);
+}
 
-} // end thrust
-
-#include <thrust/detail/host_vector.inl>
+/*! \}
+ */
 
+THRUST_NAMESPACE_END
diff --git a/thrust/inner_product.h b/thrust/inner_product.h
index 0206eff38..80068cf0c 100644
--- a/thrust/inner_product.h
+++ b/thrust/inner_product.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -53,9 +51,9 @@ namespace thrust
  *          and <tt>[first2, last2)</tt> plus \p init.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
  *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
  *          and is convertible to \p OutputType.
@@ -75,7 +73,7 @@ namespace thrust
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -105,9 +103,9 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  \return The inner product of sequences <tt>[first1, last1)</tt>
  *          and <tt>[first2, last2)</tt> plus \p init.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and if \c x is an object of type \p OutputType, and \c y is an object of \p InputIterator1's \c value_type,
  *          and \c z is an object of \p InputIterator2's \c value_type, then <tt>x + y * z</tt> is defined
  *          and is convertible to \p OutputType.
@@ -126,7 +124,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename InputIterator1, typename InputIterator2, typename OutputType>
 OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
@@ -154,15 +152,15 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
- *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction1 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
- *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction2 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
  * 
  *  \code
@@ -181,7 +179,7 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -219,15 +217,15 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  \param binary_op2 Generalized multiplication operation.
  *  \return The inner product of sequences <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator1's \c value_type is convertible to \p BinaryFunction2's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *          and \p InputIterator2's \c value_type is convertible to \p BinaryFunction2's \c second_argument_type.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p OutputType is convertible to \p BinaryFunction1's \c first_argument_type.
- *  \tparam BinaryFunction1 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction1 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction1's \c return_type is convertible to \p OutputType.
- *  \tparam BinaryFunction2 is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction2 is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction2's \c return_type is convertible to \p BinaryFunction1's \c second_argument_type.
  * 
  *  \code
@@ -245,7 +243,7 @@ OutputType inner_product(const thrust::detail::execution_policy_base<DerivedPoli
  *  // result == 31.0f
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/inner_product.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/inner_product
  */
 template<typename InputIterator1, typename InputIterator2, typename OutputType,
          typename BinaryFunction1, typename BinaryFunction2>
@@ -258,7 +256,7 @@ OutputType inner_product(InputIterator1 first1, InputIterator1 last1,
  *  \} // end reductions
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/inner_product.inl>
 
diff --git a/thrust/iterator/constant_iterator.h b/thrust/iterator/constant_iterator.h
index 344389c3e..c6eec28e7 100644
--- a/thrust/iterator/constant_iterator.h
+++ b/thrust/iterator/constant_iterator.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/detail/constant_iterator_base.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -71,7 +70,7 @@ namespace thrust
  *  #include <thrust/functional.h>
  *  #include <thrust/device_vector.h>
  *
- *  int main(void)
+ *  int main()
  *  {
  *    thrust::device_vector<int> data(4);
  *    data[0] = 3;
@@ -117,8 +116,8 @@ template<typename Value,
      *  null constructor.
      */
     __host__ __device__
-    constant_iterator(void)
-      : super_t(), m_value(){};
+    constant_iterator()
+      : super_t(), m_value() {}
 
     /*! Copy constructor copies the value of another \p constant_iterator into this
      *  \p constant_iterator.
@@ -173,7 +172,7 @@ template<typename Value,
      *  \return A \c const reference to this \p constant_iterator's constant value.
      */
     __host__ __device__
-    Value const& value(void) const
+    Value const& value() const
     { return m_value; }
 
     /*! \cond
@@ -181,16 +180,16 @@ template<typename Value,
 
   protected:
     __host__ __device__
-    Value const& value_reference(void) const
+    Value const& value_reference() const
     { return m_value; }
 
     __host__ __device__
-    Value & value_reference(void)
+    Value & value_reference()
     { return m_value; }
   
   private: // Core iterator interface
     __host__ __device__
-    reference dereference(void) const
+    reference dereference() const
     {
       return m_value;
     }
@@ -217,11 +216,11 @@ template<typename Value,
  *
  *  \see constant_iterator
  */
-template<typename V, typename I>
+template<typename ValueT, typename IndexT>
 inline __host__ __device__
-constant_iterator<V,I> make_constant_iterator(V x, I i = int())
+constant_iterator<ValueT, IndexT> make_constant_iterator(ValueT x, IndexT i = int())
 {
-  return constant_iterator<V,I>(x, i);
+  return constant_iterator<ValueT, IndexT>(x, i);
 } // end make_constant_iterator()
 
 
@@ -247,5 +246,5 @@ constant_iterator<V> make_constant_iterator(V x)
 /*! \} // end iterators
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/counting_iterator.h b/thrust/iterator/counting_iterator.h
index 791a221bf..f66cb97ef 100644
--- a/thrust/iterator/counting_iterator.h
+++ b/thrust/iterator/counting_iterator.h
@@ -22,7 +22,7 @@
 
 /*
  * Copyright David Abrahams 2003.
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -39,8 +39,7 @@
 // #include the details first
 #include <thrust/iterator/detail/counting_iterator.inl>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -65,14 +64,14 @@ namespace thrust
  *  // create iterators
  *  thrust::counting_iterator<int> first(10);
  *  thrust::counting_iterator<int> last = first + 3;
- *   
+ *
  *  first[0]   // returns 10
  *  first[1]   // returns 11
  *  first[100] // returns 110
- *   
+ *
  *  // sum of [first, last)
  *  thrust::reduce(first, last);   // returns 33 (i.e. 10 + 11 + 12)
- *   
+ *
  *  // initialize vector to [0,1,2,..]
  *  thrust::counting_iterator<int> iter(0);
  *  thrust::device_vector<int> vec(500);
@@ -89,11 +88,11 @@ namespace thrust
  *  #include <thrust/copy.h>
  *  #include <thrust/functional.h>
  *  #include <thrust/device_vector.h>
- *   
- *  int main(void)
+ *
+ *  int main()
  *  {
  *   // this example computes indices for all the nonzero values in a sequence
- *   
+ *
  *   // sequence of zero and nonzero values
  *   thrust::device_vector<int> stencil(8);
  *   stencil[0] = 0;
@@ -104,13 +103,13 @@ namespace thrust
  *   stencil[5] = 1;
  *   stencil[6] = 0;
  *   stencil[7] = 1;
- *   
+ *
  *   // storage for the nonzero indices
  *   thrust::device_vector<int> indices(8);
- *   
+ *
  *   // compute indices of nonzero elements
  *   typedef thrust::device_vector<int>::iterator IndexIterator;
- *   
+ *
  *   // use make_counting_iterator to define the sequence [0, 8)
  *   IndexIterator indices_end = thrust::copy_if(thrust::make_counting_iterator(0),
  *                                               thrust::make_counting_iterator(8),
@@ -118,7 +117,7 @@ namespace thrust
  *                                               indices.begin(),
  *                                               thrust::identity<int>());
  *   // indices now contains [1,2,5,7]
- *   
+ *
  *   return 0;
  *  }
  *  \endcode
@@ -145,11 +144,11 @@ template<typename Incrementable,
     /*! \endcond
      */
 
-    /*! Null constructor initializes this \p counting_iterator's \c Incrementable
-     *  counter using its null constructor.
+    /*! Default constructor initializes this \p counting_iterator's counter to
+     * `Incrementable{}`.
      */
     __host__ __device__
-    counting_iterator(void){};
+    counting_iterator() : super_t(Incrementable{}) {}
 
     /*! Copy constructor copies the value of another \p counting_iterator into a
      *  new \p counting_iterator.
@@ -159,7 +158,7 @@ template<typename Incrementable,
     __host__ __device__
     counting_iterator(counting_iterator const &rhs):super_t(rhs.base()){}
 
-    /*! Copy constructor copies the value of another counting_iterator 
+    /*! Copy constructor copies the value of another counting_iterator
      *  with related System type.
      *
      *  \param rhs The \p counting_iterator to copy.
@@ -175,18 +174,22 @@ template<typename Incrementable,
 
     /*! This \c explicit constructor copies the value of an \c Incrementable
      *  into a new \p counting_iterator's \c Incrementable counter.
-     *  
+     *
      *  \param x The initial value of the new \p counting_iterator's \c Incrementable
      *         counter.
      */
     __host__ __device__
     explicit counting_iterator(Incrementable x):super_t(x){}
 
+#if THRUST_CPP_DIALECT >= 2011
+    counting_iterator & operator=(const counting_iterator &) = default;
+#endif
+
     /*! \cond
      */
   private:
     __host__ __device__
-    reference dereference(void) const
+    reference dereference() const
     {
       return this->base_reference();
     }
@@ -239,5 +242,5 @@ counting_iterator<Incrementable> make_counting_iterator(Incrementable x)
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/any_assign.h b/thrust/iterator/detail/any_assign.h
index 27f438260..87192215c 100644
--- a/thrust/iterator/detail/any_assign.h
+++ b/thrust/iterator/detail/any_assign.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -27,7 +26,7 @@ namespace detail
 // a type which may be assigned any other type
 struct any_assign
 {
-  inline __host__ __device__ any_assign(void)
+  inline __host__ __device__ any_assign()
   {}
 
   template<typename T>
@@ -51,5 +50,5 @@ struct any_assign
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/any_system_tag.h b/thrust/iterator/detail/any_system_tag.h
index c49d88d1f..2c5ce6448 100644
--- a/thrust/iterator/detail/any_system_tag.h
+++ b/thrust/iterator/detail/any_system_tag.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 struct any_system_tag
   : thrust::execution_policy<any_system_tag>
@@ -30,8 +29,5 @@ struct any_system_tag
   template<typename T> operator T () const {return T();}
 };
 
-// TODO remove this in 1.7.0
-typedef THRUST_DEPRECATED any_system_tag any_space_tag;
-
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/constant_iterator_base.h b/thrust/iterator/detail/constant_iterator_base.h
index 6b35a906b..56bb7a5d0 100644
--- a/thrust/iterator/detail/constant_iterator_base.h
+++ b/thrust/iterator/detail/constant_iterator_base.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of constant_iterator
 template<typename,typename,typename> class constant_iterator;
@@ -45,7 +46,7 @@ template<typename Value,
   // the incrementable type is int unless otherwise specified
   typedef typename thrust::detail::ia_dflt_help<
     Incrementable,
-    thrust::detail::identity_<int>
+    thrust::detail::identity_<thrust::detail::intmax_t>
   >::type incrementable;
 
   typedef typename thrust::counting_iterator<
@@ -66,5 +67,5 @@ template<typename Value,
 
 } // end detail
   
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/counting_iterator.inl b/thrust/iterator/detail/counting_iterator.inl
index 6289fee36..ee4a9df15 100644
--- a/thrust/iterator/detail/counting_iterator.inl
+++ b/thrust/iterator/detail/counting_iterator.inl
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/numeric_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <cstddef>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of counting_iterator
 template <typename Incrementable, typename System, typename Traversal, typename Difference>
@@ -69,7 +70,7 @@ template <typename Incrementable, typename System, typename Traversal, typename
   // our implementation departs from Boost's in that counting_iterator::dereference
   // returns a copy of its counter, rather than a reference to it. returning a reference
   // to the internal state of an iterator causes subtle bugs (consider the temporary
-  // iterator created in the expression *(iter + i) ) and has no compelling use case
+  // iterator created in the expression *(iter + i)) and has no compelling use case
   typedef thrust::iterator_adaptor<
     counting_iterator<Incrementable, System, Traversal, Difference>, // self
     Incrementable,                                                  // Base
@@ -137,5 +138,5 @@ template<typename Difference, typename Incrementable1, typename Incrementable2>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/device_system_tag.h b/thrust/iterator/detail/device_system_tag.h
index 394b991cd..b86109d21 100644
--- a/thrust/iterator/detail/device_system_tag.h
+++ b/thrust/iterator/detail/device_system_tag.h
@@ -23,18 +23,8 @@
 #include __THRUST_DEVICE_SYSTEM_TAG_HEADER
 #undef __THRUST_DEVICE_SYSTEM_TAG_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::tag device_system_tag;
 
-} // end thrust
-
-// TODO remove this in 1.8.0
-namespace thrust
-{
-
-typedef THRUST_DEPRECATED device_system_tag device_space_tag;
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/iterator/detail/discard_iterator_base.h b/thrust/iterator/detail/discard_iterator_base.h
index a4a8c312b..38f77b378 100644
--- a/thrust/iterator/detail/discard_iterator_base.h
+++ b/thrust/iterator/detail/discard_iterator_base.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/any_assign.h>
 #include <cstddef> // for std::ptrdiff_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of discard_iterator
 template<typename> class discard_iterator;
@@ -60,6 +59,6 @@ template<typename System>
 
 } // end detail
   
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/detail/distance_from_result.h b/thrust/iterator/detail/distance_from_result.h
index 2b7e0d60e..fe140344d 100644
--- a/thrust/iterator/detail/distance_from_result.h
+++ b/thrust/iterator/detail/distance_from_result.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -38,5 +37,5 @@ template<typename IteratorFacade1, typename IteratorFacade2>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/host_system_tag.h b/thrust/iterator/detail/host_system_tag.h
index a487e6ac5..58478f8d9 100644
--- a/thrust/iterator/detail/host_system_tag.h
+++ b/thrust/iterator/detail/host_system_tag.h
@@ -23,18 +23,8 @@
 #include __THRUST_HOST_SYSTEM_TAG_HEADER
 #undef __THRUST_HOST_SYSTEM_TAG_HEADER
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::tag host_system_tag;
 
-} // end thrust
-
-// TODO remove this in 1.8.0
-namespace thrust
-{
-
-typedef THRUST_DEPRECATED host_system_tag host_space_tag;
-
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/iterator/detail/is_iterator_category.h b/thrust/iterator/detail/is_iterator_category.h
index b538358be..e520452a3 100644
--- a/thrust/iterator/detail/is_iterator_category.h
+++ b/thrust/iterator/detail/is_iterator_category.h
@@ -20,8 +20,7 @@
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -56,5 +55,5 @@ template <typename T>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/is_trivial_iterator.h b/thrust/iterator/detail/is_trivial_iterator.h
deleted file mode 100644
index 1e2ab32a3..000000000
--- a/thrust/iterator/detail/is_trivial_iterator.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits/pointer_traits.h>
-
-#if __GNUC__
-// forward declaration of gnu's __normal_iterator
-namespace __gnu_cxx
-{
-
-template<typename Iterator, typename Container> class __normal_iterator;
-
-} // end __gnu_cxx
-#endif // __GNUC__
-
-#if _MSC_VER
-// forward declaration of MSVC's "normal iterators"
-namespace std
-{
-
-template<typename Value, typename Difference, typename Pointer, typename Reference> struct _Ranit;
-
-} // end std
-#endif // _MSC_VER
-
-namespace thrust
-{
-namespace detail
-{
-
-#ifdef __GNUC__
-template<typename T>
-  struct is_gnu_normal_iterator
-    : false_type
-{};
-
-
-// catch gnu __normal_iterators
-template<typename Iterator, typename Container>
-  struct is_gnu_normal_iterator< __gnu_cxx::__normal_iterator<Iterator, Container> >
-    : true_type
-{};
-#endif // __GNUC__
-
-
-#ifdef _MSC_VER
-// catch msvc _Ranit
-template<typename Iterator>
-  struct is_convertible_to_msvc_Ranit :
-    is_convertible<
-      Iterator,
-      std::_Ranit<
-        typename iterator_value<Iterator>::type,
-        typename iterator_difference<Iterator>::type,
-        typename iterator_pointer<Iterator>::type,
-        typename iterator_reference<Iterator>::type
-      >
-    >
-{};
-#endif // _MSC_VER
-
-
-template<typename T>
-  struct is_trivial_iterator :
-    integral_constant<
-      bool,
-        is_pointer<T>::value
-      | thrust::detail::is_thrust_pointer<T>::value
-#if __GNUC__
-      | is_gnu_normal_iterator<T>::value
-#endif // __GNUC__
-#ifdef _MSC_VER
-      | is_convertible_to_msvc_Ranit<T>::value
-#endif // _MSC_VER
-    >
-{};
-
-} // end detail
-} // end thrust
-
diff --git a/thrust/iterator/detail/iterator_adaptor_base.h b/thrust/iterator/detail/iterator_adaptor_base.h
index d9dbfaae6..1173e414c 100644
--- a/thrust/iterator/detail/iterator_adaptor_base.h
+++ b/thrust/iterator/detail/iterator_adaptor_base.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/use_default.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 // forward declaration of iterator_adaptor for iterator_adaptor_base below
@@ -107,5 +108,5 @@ template<typename Derived,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_to_system.h b/thrust/iterator/detail/iterator_category_to_system.h
index fd378fae7..e6103b539 100644
--- a/thrust/iterator/detail/iterator_category_to_system.h
+++ b/thrust/iterator/detail/iterator_category_to_system.h
@@ -24,8 +24,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -76,5 +75,5 @@ template<typename CategoryOrTraversal>
 }; // end iterator_category_or_traversal_to_system
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_to_traversal.h b/thrust/iterator/detail/iterator_category_to_traversal.h
index d520e9deb..d8c736c50 100644
--- a/thrust/iterator/detail/iterator_category_to_traversal.h
+++ b/thrust/iterator/detail/iterator_category_to_traversal.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/iterator_category_to_system.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -32,9 +31,6 @@ namespace detail
 template <typename> struct is_iterator_system;
 template <typename> struct is_iterator_traversal;
 
-// make type_traits easy to access
-using namespace thrust::detail;
-
 template <typename Category>
   struct host_system_category_to_traversal
     : eval_if<
@@ -52,7 +48,7 @@ template <typename Category>
               eval_if<
                 is_convertible<Category, output_host_iterator_tag>::value,
                 detail::identity_<incrementable_traversal_tag>,
-                void
+                detail::identity_<void>
               >
             >
           >
@@ -80,7 +76,7 @@ template <typename Category>
               eval_if<
                 is_convertible<Category, output_device_iterator_tag>::value,
                 detail::identity_<incrementable_traversal_tag>,
-                void
+                detail::identity_<void>
               >
             >
           >
@@ -111,7 +107,7 @@ template<typename Category>
           device_system_category_to_traversal<Category>,
 
           // unknown category
-          void
+          detail::identity_<void>
         >
       >
 {};
@@ -130,5 +126,5 @@ template <typename CategoryOrTraversal>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_category_with_system_and_traversal.h b/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
index 8f5374b16..cdd8a6d36 100644
--- a/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
+++ b/thrust/iterator/detail/iterator_category_with_system_and_traversal.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -53,5 +52,5 @@ template<typename Category, typename System, typename Traversal>
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_facade_category.h b/thrust/iterator/detail/iterator_facade_category.h
index e00d3ef05..81b518002 100644
--- a/thrust/iterator/detail/iterator_facade_category.h
+++ b/thrust/iterator/detail/iterator_facade_category.h
@@ -27,8 +27,7 @@
 #include <thrust/iterator/detail/iterator_category_with_system_and_traversal.h>
 #include <thrust/iterator/detail/iterator_category_to_traversal.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
@@ -249,5 +248,5 @@ template<typename CategoryOrSystem,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_traits.inl b/thrust/iterator/detail/iterator_traits.inl
index 3076ad8e6..544c24f0b 100644
--- a/thrust/iterator/detail/iterator_traits.inl
+++ b/thrust/iterator/detail/iterator_traits.inl
@@ -14,17 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-/*! \file iterator_traits.inl
- *  \brief Inline file for iterator_traits.h.
- */
+#include <thrust/detail/config.h>
 
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/iterator/detail/iterator_category_to_traversal.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/void_t.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename Iterator>
   struct iterator_value
@@ -32,6 +31,8 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::value_type type;
 }; // end iterator_value
 
+template <typename Iterator>
+using iterator_value_t = typename iterator_value<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_pointer
@@ -39,6 +40,8 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::pointer type;
 }; // end iterator_pointer
 
+template <typename Iterator>
+using iterator_pointer_t = typename iterator_pointer<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_reference
@@ -46,6 +49,8 @@ template<typename Iterator>
   typedef typename iterator_traits<Iterator>::reference type;
 }; // end iterator_reference
 
+template <typename Iterator>
+using iterator_reference_t = typename iterator_reference<Iterator>::type;
 
 template<typename Iterator>
   struct iterator_difference
@@ -53,14 +58,31 @@ template<typename Iterator>
   typedef typename thrust::iterator_traits<Iterator>::difference_type type;
 }; // end iterator_difference
 
+template <typename Iterator>
+using iterator_difference_t = typename iterator_difference<Iterator>::type;
 
-template<typename Iterator>
-  struct iterator_system
-    : detail::iterator_category_to_system<
-        typename thrust::iterator_traits<Iterator>::iterator_category
-      >
+namespace detail
 {
-}; // end iterator_system
+
+template <typename Iterator, typename = void>
+struct iterator_system_impl {};
+
+template <typename Iterator>
+struct iterator_system_impl<
+  Iterator
+, typename voider<
+    typename iterator_traits<Iterator>::iterator_category
+  >::type
+>
+  : detail::iterator_category_to_system<
+      typename iterator_traits<Iterator>::iterator_category
+    >
+{};
+
+} // namespace detail
+
+template <typename Iterator>
+struct iterator_system : detail::iterator_system_impl<Iterator> {};
 
 // specialize iterator_system for void *, which has no category
 template<>
@@ -75,6 +97,8 @@ template<>
   typedef thrust::iterator_system<const int*>::type type;
 }; // end iterator_system<void*>
 
+template <typename Iterator>
+using iterator_system_t = typename iterator_system<Iterator>::type;
 
 template <typename Iterator>
   struct iterator_traversal
@@ -108,5 +132,5 @@ template<typename T>
 
 
 } // end namespace detail
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/iterator_traversal_tags.h b/thrust/iterator/detail/iterator_traversal_tags.h
index 73cd1f76a..1fbc8a1e4 100644
--- a/thrust/iterator/detail/iterator_traversal_tags.h
+++ b/thrust/iterator/detail/iterator_traversal_tags.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 // define Boost's traversal tags
 struct no_traversal_tag {};
@@ -37,5 +38,5 @@ struct bidirectional_traversal_tag
 struct random_access_traversal_tag
   : bidirectional_traversal_tag {};
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/join_iterator.h b/thrust/iterator/detail/join_iterator.h
index c38828040..83f143dc0 100644
--- a/thrust/iterator/detail/join_iterator.h
+++ b/thrust/iterator/detail/join_iterator.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -100,6 +99,10 @@ class join_iterator
   private:
     friend class thrust::iterator_core_access;
 
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
 
     __host__ __device__
     typename super_t::reference dereference() const
@@ -108,6 +111,8 @@ class join_iterator
       return (i < m_n1) ? m_iter1[i] : static_cast<typename super_t::reference>(m_iter2[i]);
     } // end dereference()
 
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
 
     size_type m_n1;
     RandomAccessIterator1 m_iter1;
@@ -124,5 +129,5 @@ join_iterator<RandomAccessIterator1,RandomAccessIterator2,Size> make_join_iterat
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/minimum_category.h b/thrust/iterator/detail/minimum_category.h
index abb80d8c1..01e7e82c5 100644
--- a/thrust/iterator/detail/minimum_category.h
+++ b/thrust/iterator/detail/minimum_category.h
@@ -16,10 +16,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits/minimum_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 { 
@@ -47,6 +48,6 @@ template<typename T1,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/detail/minimum_system.h b/thrust/iterator/detail/minimum_system.h
index 45b5a592f..dcb29ccd2 100644
--- a/thrust/iterator/detail/minimum_system.h
+++ b/thrust/iterator/detail/minimum_system.h
@@ -21,8 +21,7 @@
 #include <thrust/detail/type_traits/is_metafunction_defined.h>
 #include <thrust/detail/type_traits/minimum_type.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 { 
 
@@ -78,5 +77,5 @@ template<typename T1,
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/normal_iterator.h b/thrust/iterator/detail/normal_iterator.h
index 56a7fd023..eb5d33604 100644
--- a/thrust/iterator/detail/normal_iterator.h
+++ b/thrust/iterator/detail/normal_iterator.h
@@ -22,12 +22,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
-#include <thrust/iterator/detail/is_trivial_iterator.h>
 #include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -67,10 +68,12 @@ template<typename Pointer>
   return normal_iterator<Pointer>(ptr);
 }
 
+} // end detail
 
-template<typename T> struct is_trivial_iterator< normal_iterator<T> > : public true_type {};
-
+template <typename T>
+struct proclaim_contiguous_iterator<
+  thrust::detail::normal_iterator<T>
+> : true_type {};
 
-} // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/permutation_iterator_base.h b/thrust/iterator/detail/permutation_iterator_base.h
index 2610cfdfa..d586cabb7 100644
--- a/thrust/iterator/detail/permutation_iterator_base.h
+++ b/thrust/iterator/detail/permutation_iterator_base.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/minimum_system.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template<typename,typename> class permutation_iterator;
 
@@ -49,5 +50,5 @@ template<typename ElementIterator,
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/retag.h b/thrust/iterator/detail/retag.h
index a512d3640..d277d8b6f 100644
--- a/thrust/iterator/detail/retag.h
+++ b/thrust/iterator/detail/retag.h
@@ -21,8 +21,7 @@
 #include <thrust/iterator/detail/tagged_iterator.h>
 #include <thrust/detail/pointer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -144,5 +143,5 @@ __host__ __device__
 } // end retag()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/reverse_iterator.inl b/thrust/iterator/detail/reverse_iterator.inl
index 21c4712bc..9182ac3e8 100644
--- a/thrust/iterator/detail/reverse_iterator.inl
+++ b/thrust/iterator/detail/reverse_iterator.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,18 +14,21 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace detail
 {
 
 __thrust_exec_check_disable__
 template<typename Iterator>
-__host__ __device__
+  __host__ __device__
   Iterator prior(Iterator x)
 {
   return --x;
@@ -34,6 +37,7 @@ __host__ __device__
 } // end detail
 
 template<typename BidirectionalIterator>
+  __host__ __device__
   reverse_iterator<BidirectionalIterator>
     ::reverse_iterator(BidirectionalIterator x)
       :super_t(x)
@@ -42,45 +46,50 @@ template<typename BidirectionalIterator>
 
 template<typename BidirectionalIterator>
   template<typename OtherBidirectionalIterator>
+    __host__ __device__
     reverse_iterator<BidirectionalIterator>
       ::reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
 // XXX msvc screws this up
-#ifndef _MSC_VER
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
                      , typename thrust::detail::enable_if<
                          thrust::detail::is_convertible<
                            OtherBidirectionalIterator,
                            BidirectionalIterator
                          >::value
                        >::type *
-#endif // _MSC_VER
+#endif // MSVC
                      )
         :super_t(r.base())
 {
 } // end reverse_iterator::reverse_iterator()
 
 template<typename BidirectionalIterator>
+  __host__ __device__
   typename reverse_iterator<BidirectionalIterator>::super_t::reference
     reverse_iterator<BidirectionalIterator>
-      ::dereference(void) const
+      ::dereference() const
 {
   return *thrust::detail::prior(this->base());
 } // end reverse_iterator::increment()
 
 template<typename BidirectionalIterator>
+  __host__ __device__
   void reverse_iterator<BidirectionalIterator>
-    ::increment(void)
+    ::increment()
 {
   --this->base_reference();
 } // end reverse_iterator::increment()
 
 template<typename BidirectionalIterator>
+  __host__ __device__
   void reverse_iterator<BidirectionalIterator>
-    ::decrement(void)
+    ::decrement()
 {
   ++this->base_reference();
 } // end reverse_iterator::decrement()
 
 template<typename BidirectionalIterator>
+  __host__ __device__
   void reverse_iterator<BidirectionalIterator>
     ::advance(typename super_t::difference_type n)
 {
@@ -89,6 +98,7 @@ template<typename BidirectionalIterator>
 
 template<typename BidirectionalIterator>
   template<typename OtherBidirectionalIterator>
+    __host__ __device__
     typename reverse_iterator<BidirectionalIterator>::super_t::difference_type
       reverse_iterator<BidirectionalIterator>
         ::distance_to(reverse_iterator<OtherBidirectionalIterator> const &y) const
@@ -104,5 +114,5 @@ reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalItera
 } // end make_reverse_iterator()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/reverse_iterator_base.h b/thrust/iterator/detail/reverse_iterator_base.h
index 68fa1f2f8..de3bafde9 100644
--- a/thrust/iterator/detail/reverse_iterator_base.h
+++ b/thrust/iterator/detail/reverse_iterator_base.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <typename> class reverse_iterator;
 
@@ -38,5 +39,5 @@ template<typename BidirectionalIterator>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/tagged_iterator.h b/thrust/iterator/detail/tagged_iterator.h
index b7f6fa32b..24cbbb736 100644
--- a/thrust/iterator/detail/tagged_iterator.h
+++ b/thrust/iterator/detail/tagged_iterator.h
@@ -20,9 +20,9 @@
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/use_default.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -51,24 +51,35 @@ template<typename Iterator, typename Tag>
 
   public:
     __host__ __device__
-    tagged_iterator(void) {}
+    tagged_iterator() {}
 
     __host__ __device__
     explicit tagged_iterator(Iterator x)
       : super_t(x) {}
 }; // end tagged_iterator
 
+/*! \p make_tagged_iterator creates a \p tagged_iterator
+ *  from a \c Iterator with system tag \c Tag.
+ *
+ *  \tparam Tag Any system tag.
+ *  \tparam Iterator Any iterator type.
+ *  \param iter The iterator of interest.
+ *  \return An iterator whose system tag is \p Tag and whose behavior is otherwise
+ *          equivalent to \p iter.
+ */
+template <typename Tag, typename Iterator>
+inline auto make_tagged_iterator(Iterator iter) -> tagged_iterator<Iterator, Tag>
+{
+  return tagged_iterator<Iterator, Tag>(iter);
+}
 
-// specialize is_trivial_iterator for tagged_iterator
-template<typename> struct is_trivial_iterator;
-
-// tagged_iterator is trivial if its base iterator is
-template<typename BaseIterator, typename Tag>
-  struct is_trivial_iterator<tagged_iterator<BaseIterator,Tag> >
-    : is_trivial_iterator<BaseIterator>
-{};
+} // end detail
 
+// tagged_iterator is trivial if its base iterator is.
+template <typename BaseIterator, typename Tag>
+struct proclaim_contiguous_iterator<
+  detail::tagged_iterator<BaseIterator, Tag>
+> : is_contiguous_iterator<BaseIterator> {};
 
-} // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/transform_input_output_iterator.inl b/thrust/iterator/detail/transform_input_output_iterator.inl
new file mode 100644
index 000000000..b4792f724
--- /dev/null
+++ b/thrust/iterator/detail/transform_input_output_iterator.inl
@@ -0,0 +1,107 @@
+/*
+ *  Copyright 2020-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_adaptor.h>
+#include <thrust/detail/type_traits.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator;
+
+namespace detail
+{
+
+// Proxy reference that invokes InputFunction when reading from and
+// OutputFunction when writing to the dereferenced iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator_proxy
+{
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+  using Value = invoke_result_t<InputFunction, iterator_value_type>;
+
+  public:
+    __host__ __device__
+    transform_input_output_iterator_proxy(const Iterator& io, InputFunction input_function, OutputFunction output_function)
+      : io(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    transform_input_output_iterator_proxy(const transform_input_output_iterator_proxy&) = default;
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    operator Value const() const
+    {
+      return input_function(*io);
+    }
+
+    __thrust_exec_check_disable__
+    template <typename T>
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const T& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+    __thrust_exec_check_disable__
+    __host__ __device__
+    transform_input_output_iterator_proxy operator=(const transform_input_output_iterator_proxy& x)
+    {
+      *io = output_function(x);
+      return *this;
+    }
+
+  private:
+    Iterator io;
+    InputFunction input_function;
+    OutputFunction output_function;
+};
+
+// Compute the iterator_adaptor instantiation to be used for transform_input_output_iterator
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct transform_input_output_iterator_base
+{
+private:
+  using iterator_value_type = typename thrust::iterator_value<Iterator>::type;
+
+public:
+    typedef thrust::iterator_adaptor
+    <
+        transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+      , Iterator
+      , detail::invoke_result_t<InputFunction, iterator_value_type>
+      , thrust::use_default
+      , thrust::use_default
+      , transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator>
+    > type;
+};
+
+// Register transform_input_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+struct is_proxy_reference<
+    transform_input_output_iterator_proxy<InputFunction, OutputFunction, Iterator> >
+    : public thrust::detail::true_type {};
+
+} // end detail
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/detail/transform_iterator.inl b/thrust/iterator/detail/transform_iterator.inl
index e7eb214e2..0dc6f9854 100644
--- a/thrust/iterator/detail/transform_iterator.inl
+++ b/thrust/iterator/detail/transform_iterator.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,19 +14,22 @@
  *  limitations under the License.
  */
 
-#include <thrust/iterator/transform_iterator.h>
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
 #include <thrust/iterator/iterator_adaptor.h>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/result_of.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 template <class UnaryFunction, class Iterator, class Reference, class Value>
   class transform_iterator;
-  
-namespace detail 
+
+namespace detail
 {
 
 // Compute the iterator_adaptor instantiation to be used for transform_iterator
@@ -37,25 +40,19 @@ struct transform_iterator_base
     // By default, dereferencing the iterator yields the same as the function.
     typedef typename thrust::detail::ia_dflt_help<
       Reference,
-      thrust::detail::result_of<UnaryFunc(typename thrust::iterator_value<Iterator>::type)>
+      thrust::detail::result_of_adaptable_function<UnaryFunc(typename thrust::iterator_value<Iterator>::type)>
     >::type reference;
 
-    // To get the default for Value: remove any reference on the
-    // result type, but retain any constness to signal
-    // non-writability.  Note that if we adopt Thomas' suggestion
-    // to key non-writability *only* on the Reference argument,
-    // we'd need to strip constness here as well.
-    typedef typename thrust::detail::ia_dflt_help<
-      Value,
-      thrust::detail::remove_reference<reference>
-    >::type cv_value_type;
+    // To get the default for Value: remove cvref on the result type.
+    using value_type =
+      typename thrust::detail::ia_dflt_help<Value, thrust::remove_cvref<reference>>::type;
 
- public:
+  public:
     typedef thrust::iterator_adaptor
     <
         transform_iterator<UnaryFunc, Iterator, Reference, Value>
       , Iterator
-      , cv_value_type
+      , value_type
       , thrust::use_default   // Leave the system alone
         //, thrust::use_default   // Leave the traversal alone
         // use the Iterator's category to let any system iterators remain random access even though
@@ -68,5 +65,5 @@ struct transform_iterator_base
 
 
 } // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/transform_output_iterator.inl b/thrust/iterator/detail/transform_output_iterator.inl
new file mode 100644
index 000000000..d5033f105
--- /dev/null
+++ b/thrust/iterator/detail/transform_output_iterator.inl
@@ -0,0 +1,81 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator;
+
+namespace detail
+{
+
+// Proxy reference that uses Unary Function to transform the rhs of assigment
+// operator before writing the result to OutputIterator
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator_proxy
+{
+  public:
+    __host__ __device__
+    transform_output_iterator_proxy(const OutputIterator& out, UnaryFunction fun) : out(out), fun(fun)
+    {
+    }
+
+    __thrust_exec_check_disable__
+    template <typename T>
+    __host__ __device__
+    transform_output_iterator_proxy operator=(const T& x)
+    {
+      *out = fun(x);
+      return *this;
+    }
+
+  private:
+    OutputIterator out;
+    UnaryFunction fun;
+};
+
+// Compute the iterator_adaptor instantiation to be used for transform_output_iterator
+template <typename UnaryFunction, typename OutputIterator>
+struct transform_output_iterator_base
+{
+    typedef thrust::iterator_adaptor
+    <
+        transform_output_iterator<UnaryFunction, OutputIterator>
+      , OutputIterator
+      , thrust::use_default
+      , thrust::use_default
+      , thrust::use_default
+      , transform_output_iterator_proxy<UnaryFunction, OutputIterator>
+    > type;
+};
+
+// Register transform_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class UnaryFunction, class OutputIterator>
+struct is_proxy_reference<
+    transform_output_iterator_proxy<UnaryFunction, OutputIterator> >
+    : public thrust::detail::true_type {};
+
+} // end detail
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/detail/tuple_of_iterator_references.h b/thrust/iterator/detail/tuple_of_iterator_references.h
index 46feccfc0..78c5e8a28 100644
--- a/thrust/iterator/detail/tuple_of_iterator_references.h
+++ b/thrust/iterator/detail/tuple_of_iterator_references.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -21,23 +21,19 @@
 #include <thrust/pair.h>
 #include <thrust/detail/reference_forward_declaration.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
   
 template<
-  typename T0, typename T1, typename T2,
-  typename T3, typename T4, typename T5,
-  typename T6, typename T7, typename T8,
-  typename T9
+  typename... Ts
 >
   class tuple_of_iterator_references
-    : public thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9>
+    : public thrust::tuple<Ts...>
 {
   private:
-    typedef thrust::tuple<T0,T1,T2,T3,T4,T5,T6,T7,T8,T9> super_t;
+    typedef thrust::tuple<Ts...> super_t;
 
   public:
     // allow implicit construction from tuple<refs>
@@ -48,9 +44,10 @@ template<
 
     // allow assignment from tuples
     // XXX might be worthwhile to guard this with an enable_if is_assignable
-    template<typename U1, typename U2>
+    __thrust_exec_check_disable__
+    template<typename... Us>
     inline __host__ __device__
-    tuple_of_iterator_references &operator=(const detail::cons<U1,U2> &other)
+    tuple_of_iterator_references &operator=(const thrust::tuple<Us...> &other)
     {
       super_t::operator=(other);
       return *this;
@@ -58,6 +55,7 @@ template<
 
     // allow assignment from pairs
     // XXX might be worthwhile to guard this with an enable_if is_assignable
+    __thrust_exec_check_disable__
     template<typename U1, typename U2>
     inline __host__ __device__
     tuple_of_iterator_references &operator=(const thrust::pair<U1,U2> &other)
@@ -69,24 +67,22 @@ template<
     // allow assignment from reference<tuple>
     // XXX perhaps we should generalize to reference<T>
     //     we could captures reference<pair> this way
-    template<typename U0, typename U1, typename U2,
-             typename U3, typename U4, typename U5,
-             typename U6, typename U7, typename U8,
-             typename U9,
-             typename Pointer, typename Derived>
+    __thrust_exec_check_disable__
+    template<typename Pointer, typename Derived,
+             typename... Us>
     inline __host__ __device__
 // XXX gcc-4.2 crashes on is_assignable
 //    typename thrust::detail::enable_if<
 //      thrust::detail::is_assignable<
 //        super_t,
-//        const thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>
+//        const thrust::tuple<Us...>
 //      >::value,
 //      tuple_of_iterator_references &
 //    >::type
     tuple_of_iterator_references &
-    operator=(const thrust::reference<thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9>, Pointer, Derived> &other)
+    operator=(const thrust::reference<thrust::tuple<Us...>, Pointer, Derived> &other)
     {
-      typedef thrust::tuple<U0,U1,U2,U3,U4,U5,U6,U7,U8,U9> tuple_type;
+      typedef thrust::tuple<Us...> tuple_type;
 
       // XXX perhaps this could be accelerated
       tuple_type other_tuple = other;
@@ -99,148 +95,52 @@ template<
     inline __host__ __device__
     tuple_of_iterator_references() {}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0)
-      : super_t(t0,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
+    inline __host__ __device__
+    tuple_of_iterator_references(typename access_traits<Ts>::parameter_type... ts)
+      : super_t(ts...)
     {}
+};
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1)
-      : super_t(t0, t1,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2)
-      : super_t(t0, t1, t2,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
+// this overload of swap() permits swapping tuple_of_iterator_references returned as temporaries from
+// iterator dereferences
+template<
+  typename... Ts,
+  typename... Us
+>
+inline __host__ __device__
+void swap(tuple_of_iterator_references<Ts...> x,
+          tuple_of_iterator_references<Us...> y)
+{
+  x.swap(y);
+}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3)
-      : super_t(t0, t1, t2, t3,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4)
-      : super_t(t0, t1, t2, t3, t4,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
+} // end detail
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5)
-      : super_t(t0, t1, t2, t3, t4, t5,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
+// define tuple_size, tuple_element, etc.
+template<class... Ts>
+struct tuple_size<detail::tuple_of_iterator_references<Ts...>>
+  : std::integral_constant<size_t, sizeof...(Ts)>
+{};
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6)
-      : super_t(t0, t1, t2, t3, t4, t5, t6,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
+template<size_t i>
+struct tuple_element<i, detail::tuple_of_iterator_references<>> {};
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7,
-                static_cast<const null_type&>(null_type()),
-                static_cast<const null_type&>(null_type()))
-    {}
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7,
-                                 typename access_traits<T8>::parameter_type t8)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8,
-                static_cast<const null_type&>(null_type()))
-    {}
+template<class T, class... Ts>
+struct tuple_element<0, detail::tuple_of_iterator_references<T,Ts...>>
+{
+  using type = T;
+};
 
-    inline __host__ __device__ 
-    tuple_of_iterator_references(typename access_traits<T0>::parameter_type t0,
-                                 typename access_traits<T1>::parameter_type t1,
-                                 typename access_traits<T2>::parameter_type t2,
-                                 typename access_traits<T3>::parameter_type t3,
-                                 typename access_traits<T4>::parameter_type t4,
-                                 typename access_traits<T5>::parameter_type t5,
-                                 typename access_traits<T6>::parameter_type t6,
-                                 typename access_traits<T7>::parameter_type t7,
-                                 typename access_traits<T8>::parameter_type t8,
-                                 typename access_traits<T9>::parameter_type t9)
-      : super_t(t0, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-    {}
+
+template<size_t i, class T, class... Ts>
+struct tuple_element<i, detail::tuple_of_iterator_references<T,Ts...>>
+{
+  using type = typename tuple_element<i - 1, detail::tuple_of_iterator_references<Ts...>>::type;
 };
 
 
-} // end detail
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/universal_categories.h b/thrust/iterator/detail/universal_categories.h
index 2389796b1..d2abd7f55 100644
--- a/thrust/iterator/detail/universal_categories.h
+++ b/thrust/iterator/detail/universal_categories.h
@@ -21,8 +21,7 @@
 
 // XXX eliminate this file
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // define these types without inheritance to avoid ambiguous conversion to base classes
 
@@ -83,5 +82,5 @@ struct random_access_universal_iterator_tag
 };
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/zip_iterator.inl b/thrust/iterator/detail/zip_iterator.inl
index d5e65431d..a2bc98afe 100644
--- a/thrust/iterator/detail/zip_iterator.inl
+++ b/thrust/iterator/detail/zip_iterator.inl
@@ -16,17 +16,18 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/tuple_transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 template<typename IteratorTuple>
 __host__ __device__
   zip_iterator<IteratorTuple>
-    ::zip_iterator(void)
+    ::zip_iterator()
 {
 } // end zip_iterator::zip_iterator()
 
@@ -57,7 +58,7 @@ template<typename IteratorTuple>
 template<typename IteratorTuple>
 __host__ __device__
 const IteratorTuple &zip_iterator<IteratorTuple>
-  ::get_iterator_tuple(void) const
+  ::get_iterator_tuple() const
 {
   return m_iterator_tuple;
 } // end zip_iterator::get_iterator_tuple()
@@ -67,11 +68,13 @@ template<typename IteratorTuple>
   typename zip_iterator<IteratorTuple>::super_t::reference
   __host__ __device__
     zip_iterator<IteratorTuple>
-      ::dereference(void) const
+      ::dereference() const
 {
   using namespace detail::tuple_impl_specific;
 
-  return thrust::detail::tuple_host_device_transform<detail::dereference_iterator::template apply>(get_iterator_tuple(), detail::dereference_iterator());
+  return thrust::detail::tuple_host_device_transform<
+    detail::dereference_iterator::template apply
+  >(get_iterator_tuple(), detail::dereference_iterator());
 } // end zip_iterator::dereference()
 
 
@@ -100,7 +103,7 @@ __host__ __device__
 template<typename IteratorTuple>
 __host__ __device__
   void zip_iterator<IteratorTuple>
-    ::increment(void)
+    ::increment()
 {
   using namespace detail::tuple_impl_specific;
   tuple_for_each(m_iterator_tuple, detail::increment_iterator());
@@ -110,7 +113,7 @@ __host__ __device__
 template<typename IteratorTuple>
 __host__ __device__
   void zip_iterator<IteratorTuple>
-    ::decrement(void)
+    ::decrement()
 {
   using namespace detail::tuple_impl_specific;
   tuple_for_each(m_iterator_tuple, detail::decrement_iterator());
@@ -129,13 +132,21 @@ template<typename IteratorTuple>
 } // end zip_iterator::distance_to()
 
 
-template<typename IteratorTuple>
+template<typename... Iterators>
+__host__ __device__
+  zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(thrust::tuple<Iterators...> t)
+{
+  return zip_iterator<thrust::tuple<Iterators...>>(t);
+} // end make_zip_iterator()
+
+
+template<typename... Iterators>
 __host__ __device__
-  zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t)
+  zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(Iterators... its)
 {
-  return zip_iterator<IteratorTuple>(t);
+  return make_zip_iterator(thrust::make_tuple(its...));
 } // end make_zip_iterator()
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/detail/zip_iterator_base.h b/thrust/iterator/detail/zip_iterator_base.h
index e0d941c8f..030153b65 100644
--- a/thrust/iterator/detail/zip_iterator_base.h
+++ b/thrust/iterator/detail/zip_iterator_base.h
@@ -16,19 +16,22 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
+#include <thrust/advance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/iterator/iterator_categories.h>
 #include <thrust/iterator/detail/minimum_category.h>
 #include <thrust/iterator/detail/minimum_system.h>
+#include <thrust/type_traits/integer_sequence.h>
 #include <thrust/tuple.h>
 #include <thrust/detail/tuple_meta_transform.h>
 #include <thrust/detail/tuple_transform.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/detail/tuple_of_iterator_references.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declare zip_iterator for zip_iterator_base
 template<typename IteratorTuple> class zip_iterator;
@@ -45,12 +48,12 @@ class advance_iterator
 public:
   inline __host__ __device__
   advance_iterator(DiffType step) : m_step(step) {}
-  
+
   __thrust_exec_check_disable__
   template<typename Iterator>
   inline __host__ __device__
   void operator()(Iterator& it) const
-  { it += m_step; }
+  { thrust::advance(it, m_step); }
 
 private:
   DiffType m_step;
@@ -127,17 +130,28 @@ template<class Tuple, class BinaryMetaFun, class StartType>
   struct tuple_meta_accumulate;
 
 template<
-    typename Tuple
-  , class BinaryMetaFun
+    class BinaryMetaFun
+  , typename StartType
+>
+  struct tuple_meta_accumulate<thrust::tuple<>,BinaryMetaFun,StartType>
+{
+   typedef typename thrust::detail::identity_<StartType>::type type;
+};
+
+
+template<
+    class BinaryMetaFun
   , typename StartType
+  , typename    T
+  , typename... Ts
 >
-  struct tuple_meta_accumulate_impl
+  struct tuple_meta_accumulate<thrust::tuple<T,Ts...>,BinaryMetaFun,StartType>
 {
    typedef typename apply2<
        BinaryMetaFun
-     , typename Tuple::head_type
+     , T
      , typename tuple_meta_accumulate<
-           typename Tuple::tail_type
+           thrust::tuple<Ts...>
          , BinaryMetaFun
          , StartType 
        >::type
@@ -145,81 +159,40 @@ template<
 };
 
 
-template<
-    typename Tuple
-  , class BinaryMetaFun
-  , typename StartType
->
-struct tuple_meta_accumulate
-  : thrust::detail::eval_if<
-        thrust::detail::is_same<Tuple, thrust::null_type>::value
-      , thrust::detail::identity_<StartType>
-      , tuple_meta_accumulate_impl<
-            Tuple
-          , BinaryMetaFun
-          , StartType
-        >
-    > // end eval_if
+template<typename Fun>
+inline __host__ __device__
+Fun tuple_for_each_helper(Fun f)
 {
-}; // end tuple_meta_accumulate
-
-
-// transform algorithm for tuples. The template parameter Fun
-// must be a unary functor which is also a unary metafunction
-// class that computes its return type based on its argument
-// type. For example:
-//
-// struct to_ptr
-// {
-//     template <class Arg>
-//     struct apply
-//     {
-//          typedef Arg* type;
-//     }
-//
-//     template <class Arg>
-//     Arg* operator()(Arg x);
-// };
-
+  return f;
+}
 
+template<typename Fun, typename T, typename... Ts>
+inline __host__ __device__
+Fun tuple_for_each_helper(Fun f, T& t, Ts&... ts)
+{
+  f(t);
+  return tuple_for_each_helper(f, ts...);
+}
 
 // for_each algorithm for tuples.
-template<typename Fun>
+
+template<typename Fun, typename... Ts, size_t... Is>
 inline __host__ __device__
-Fun tuple_for_each(thrust::null_type, Fun f)
+Fun tuple_for_each(thrust::tuple<Ts...>& t, Fun f, thrust::index_sequence<Is...>)
 {
-  return f;
+  return tuple_for_each_helper(f, thrust::get<Is>(t)...);
 } // end tuple_for_each()
 
-
-template<typename Tuple, typename Fun>
+// for_each algorithm for tuples.
+template<typename Fun, typename... Ts>
 inline __host__ __device__
-Fun tuple_for_each(Tuple& t, Fun f)
+Fun tuple_for_each(thrust::tuple<Ts...>& t, Fun f)
 { 
-  f( t.get_head() );
-  return tuple_for_each(t.get_tail(), f);
-} // end tuple_for_each()
+  return tuple_for_each(t, f, thrust::make_index_sequence<thrust::tuple_size<thrust::tuple<Ts...>>::value>{});
+}
 
 
-// Equality of tuples. NOTE: "==" for tuples currently (7/2003)
-// has problems under some compilers, so I just do my own.
-// No point in bringing in a bunch of #ifdefs here. This is
-// going to go away with the next tuple implementation anyway.
-//
-__host__ __device__
-inline bool tuple_equal(thrust::null_type, thrust::null_type)
-{ return true; }
-
-
-template<typename Tuple1, typename Tuple2>
-__host__ __device__
-bool tuple_equal(Tuple1 const& t1, Tuple2 const& t2)
-{ 
-  return t1.get_head() == t2.get_head() && 
-  tuple_equal(t1.get_tail(), t2.get_tail());
-} // end tuple_equal()
-
-} // end end tuple_impl_specific
+} // end tuple_impl_specific
 
 
 // Metafunction to obtain the type of the tuple whose element types
@@ -293,29 +266,16 @@ namespace zip_iterator_base_ns
 {
 
 
-template<int i, typename Tuple>
-  struct tuple_elements_helper
-    : eval_if<
-        (i < tuple_size<Tuple>::value),
-        tuple_element<i,Tuple>,
-        identity_<thrust::null_type>
-      >
-{};
+template<typename Tuple, typename IndexSequence>
+  struct tuple_of_iterator_references_helper;
 
 
-template<typename Tuple>
-  struct tuple_elements
+template<typename Tuple, size_t... Is>
+  struct tuple_of_iterator_references_helper<Tuple, thrust::index_sequence<Is...>>
 {
-  typedef typename tuple_elements_helper<0,Tuple>::type T0;
-  typedef typename tuple_elements_helper<1,Tuple>::type T1;
-  typedef typename tuple_elements_helper<2,Tuple>::type T2;
-  typedef typename tuple_elements_helper<3,Tuple>::type T3;
-  typedef typename tuple_elements_helper<4,Tuple>::type T4;
-  typedef typename tuple_elements_helper<5,Tuple>::type T5;
-  typedef typename tuple_elements_helper<6,Tuple>::type T6;
-  typedef typename tuple_elements_helper<7,Tuple>::type T7;
-  typedef typename tuple_elements_helper<8,Tuple>::type T8;
-  typedef typename tuple_elements_helper<9,Tuple>::type T9;
+  typedef thrust::detail::tuple_of_iterator_references<
+    typename thrust::tuple_element<Is,Tuple>::type...
+  > type;
 };
 
 
@@ -328,22 +288,11 @@ template<typename IteratorTuple>
     iterator_reference
   >::type tuple_of_references;
 
-  // get at the individual tuple element types by name
-  typedef tuple_elements<tuple_of_references> elements;
-
   // map thrust::tuple<T...> to tuple_of_iterator_references<T...>
-  typedef thrust::detail::tuple_of_iterator_references<
-    typename elements::T0,
-    typename elements::T1,
-    typename elements::T2,
-    typename elements::T3,
-    typename elements::T4,
-    typename elements::T5,
-    typename elements::T6,
-    typename elements::T7,
-    typename elements::T8,
-    typename elements::T9
-  > type;
+  typedef typename tuple_of_iterator_references_helper<
+    tuple_of_references,
+    thrust::make_index_sequence<thrust::tuple_size<tuple_of_references>::value>
+  >::type type;
 };
 
 
@@ -399,6 +348,6 @@ template<typename IteratorTuple>
 
 } // end detail
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/iterator/discard_iterator.h b/thrust/iterator/discard_iterator.h
index 7e7ffc5d4..eb5156eda 100644
--- a/thrust/iterator/discard_iterator.h
+++ b/thrust/iterator/discard_iterator.h
@@ -25,10 +25,9 @@
 #include <thrust/iterator/detail/discard_iterator_base.h>
 #include <thrust/iterator/iterator_facade.h>
 
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -53,7 +52,7 @@ namespace thrust
  *  #include <thrust/reduce.h>
  *  #include <thrust/device_vector.h>
  *
- *  int main(void)
+ *  int main()
  *  {
  *    thrust::device_vector<int> keys(7), values(7);
  *
@@ -81,9 +80,9 @@ namespace thrust
  *                          values.begin(),
  *                          thrust::make_discard_iterator(),
  *                          result.begin());
- *    
+ *
  *    // result is now [9, 21, 9, 3]
- *    
+ *
  *    return 0;
  *  }
  *  \endcode
@@ -116,9 +115,13 @@ template<typename System = use_default>
     discard_iterator(discard_iterator const &rhs)
       : super_t(rhs.base()) {}
 
+#if THRUST_CPP_DIALECT >= 2011
+    discard_iterator & operator=(const discard_iterator &) = default;
+#endif
+
     /*! This constructor receives an optional index specifying the position of this
      *  \p discard_iterator in a range.
-     *  
+     *
      *  \p i The index of this \p discard_iterator in a range. Defaults to the
      *       value returned by \c Incrementable's null constructor. For example,
      *       when <tt>Incrementable == int</tt>, \c 0.
@@ -129,10 +132,10 @@ template<typename System = use_default>
 
     /*! \cond
      */
-  
+
   private: // Core iterator interface
     __host__ __device__
-    reference dereference(void) const
+    reference dereference() const
     {
       return m_element;
     }
@@ -165,7 +168,7 @@ discard_iterator<> make_discard_iterator(discard_iterator<>::difference_type i =
 /*! \} // end iterators
  */
 
-} // end namespace thrust
-  
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
+THRUST_NAMESPACE_END
+
+THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
 
diff --git a/thrust/iterator/iterator_adaptor.h b/thrust/iterator/iterator_adaptor.h
index 6ec58e642..67d4866b9 100644
--- a/thrust/iterator/iterator_adaptor.h
+++ b/thrust/iterator/iterator_adaptor.h
@@ -37,8 +37,7 @@
 #include <thrust/detail/use_default.h>
 #include <thrust/iterator/detail/iterator_adaptor_base.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -144,6 +143,7 @@ template<typename Derived,
 
     /*! This constructor copies from a given instance of the \p Base iterator.
      */
+    __thrust_exec_check_disable__
     __host__ __device__
     explicit iterator_adaptor(Base const& iter)
       : m_iterator(iter)
@@ -200,7 +200,10 @@ template<typename Derived,
     void advance(typename iterator_adaptor::difference_type n)
     {
       // XXX statically assert on random_access_traversal_tag
-      m_iterator += n;
+
+      // counting_iterator will pick eg. diff_t=int64 when base=int32.
+      // Explicitly cast to avoid static conversion warnings.
+      m_iterator = static_cast<base_type>(m_iterator + n);
     }
 
     __thrust_exec_check_disable__
@@ -235,5 +238,5 @@ template<typename Derived,
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/iterator_categories.h b/thrust/iterator/iterator_categories.h
index 02246d446..9a6f3f4ae 100644
--- a/thrust/iterator/iterator_categories.h
+++ b/thrust/iterator/iterator_categories.h
@@ -39,8 +39,7 @@
 // #include this for stl's iterator tags
 #include <iterator>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \addtogroup iterator_tags Iterator Tags
@@ -55,7 +54,7 @@ namespace thrust
  *  representation of the Input Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  output_device_iterator_tag, forward_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -74,7 +73,7 @@ struct input_device_iterator_tag
  *  representation of the Output Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  input_device_iterator_tag, forward_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -93,7 +92,7 @@ struct output_device_iterator_tag
  *  representation of the Forward Device Iterator concept within the C++ type
  *  system.
  *
- *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html, iterator_traits,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags  iterator_traits,
  *  input_device_iterator_tag, output_device_iterator_tag,
  *  bidirectional_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -112,7 +111,7 @@ struct forward_device_iterator_tag
  *  representation of the Bidirectional Device Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, random_access_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -131,7 +130,7 @@ struct bidirectional_device_iterator_tag
  *  representation of the Random Access Device Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  input_host_iterator_tag, output_host_iterator_tag, forward_host_iterator_tag,
@@ -150,7 +149,7 @@ struct random_access_device_iterator_tag
  *  representation of the Input Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/input_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -164,7 +163,7 @@ typedef std::input_iterator_tag input_host_iterator_tag;
  *  representation of the Output Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/output_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -178,7 +177,7 @@ typedef std::output_iterator_tag output_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/forward_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -192,7 +191,7 @@ typedef std::forward_iterator_tag forward_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/bidirectional_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -206,7 +205,7 @@ typedef std::bidirectional_iterator_tag bidirectional_host_iterator_tag;
  *  representation of the Forward Host Iterator concept within the C++
  *  type system.
  *
- *  \see http://www.sgi.com/tech/sgi/random_access_iterator_tag.html,
+ *  \see https://en.cppreference.com/w/cpp/iterator/iterator_tags 
  *  iterator_traits, input_device_iterator_tag, output_device_iterator_tag,
  *  forward_device_iterator_tag, bidirectional_device_iterator_tag,
  *  random_access_device_iterator_tag,
@@ -218,7 +217,7 @@ typedef std::random_access_iterator_tag random_access_host_iterator_tag;
 /*! \} // end iterator_tag_classes
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/universal_categories.h>
 
diff --git a/thrust/iterator/iterator_facade.h b/thrust/iterator/iterator_facade.h
index 86757d712..f6920c5c8 100644
--- a/thrust/iterator/iterator_facade.h
+++ b/thrust/iterator/iterator_facade.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/detail/iterator_facade_category.h>
 #include <thrust/iterator/detail/distance_from_result.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -539,5 +538,5 @@ Derived operator+ (typename Derived::difference_type n,
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/iterator_traits.h b/thrust/iterator/iterator_traits.h
index c0faf371c..b2f4b175a 100644
--- a/thrust/iterator/iterator_traits.h
+++ b/thrust/iterator/iterator_traits.h
@@ -31,44 +31,17 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/type_traits/void_t.h>
+
 #include <iterator>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \p iterator_traits is a type trait class that provides a uniform
  *  interface for querying the properties of iterators at compile-time.
  */
-template<typename T>
-  struct iterator_traits
-{
-  typedef typename T::difference_type difference_type;
-  typedef typename T::value_type value_type;
-  typedef typename T::pointer pointer;
-  typedef typename T::reference reference;
-  typedef typename T::iterator_category iterator_category;
-};
-
-// traits are specialized for pointer types
-template<typename T>
-  struct iterator_traits<T*>
-{
-  typedef std::ptrdiff_t difference_type;
-  typedef T value_type;
-  typedef T* pointer;
-  typedef T& reference;
-  typedef std::random_access_iterator_tag iterator_category;
-};
-
-template<typename T>
-  struct iterator_traits<const T*>
-{
-  typedef std::ptrdiff_t difference_type;
-  typedef T value_type;
-  typedef const T* pointer;
-  typedef const T& reference;
-  typedef std::random_access_iterator_tag iterator_category;
-}; // end iterator_traits
+template <typename T>
+struct iterator_traits : std::iterator_traits<T> {};
 
 template<typename Iterator> struct iterator_value;
 
@@ -82,15 +55,7 @@ template<typename Iterator> struct iterator_traversal;
 
 template<typename Iterator> struct iterator_system;
 
-// TODO remove this in Thrust v1.7.0
-template<typename Iterator>
-  struct THRUST_DEPRECATED iterator_space
-{
-  typedef THRUST_DEPRECATED typename iterator_system<Iterator>::type type;
-};
-
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/iterator_traversal_tags.h>
 #include <thrust/iterator/detail/host_system_tag.h>
diff --git a/thrust/iterator/permutation_iterator.h b/thrust/iterator/permutation_iterator.h
index 27555ddd0..be5010e54 100644
--- a/thrust/iterator/permutation_iterator.h
+++ b/thrust/iterator/permutation_iterator.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 /*! \addtogroup iterators
@@ -75,7 +74,7 @@ namespace thrust
  *  #include <thrust/iterator/permutation_iterator.h>
  *  #include <thrust/device_vector.h>
  *  ...
- *  thrust::device_vector<float> values(4);
+ *  thrust::device_vector<float> values(8);
  *  values[0] = 10.0f;
  *  values[1] = 20.0f;
  *  values[2] = 30.0f;
@@ -167,6 +166,11 @@ template <typename ElementIterator,
   /*! \cond
    */
   private:
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
+
     __thrust_exec_check_disable__
     __host__ __device__
     typename super_t::reference dereference() const
@@ -174,6 +178,8 @@ template <typename ElementIterator,
       return *(m_element_iterator + *this->base());
     }
 
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
     // make friends for the copy constructor
     template<typename,typename> friend class permutation_iterator;
 
@@ -206,5 +212,5 @@ permutation_iterator<ElementIterator,IndexIterator> make_permutation_iterator(El
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/retag.h b/thrust/iterator/retag.h
index 6adf5e244..1eb770ae3 100644
--- a/thrust/iterator/retag.h
+++ b/thrust/iterator/retag.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/detail/retag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 /*! \ingroup iterator_tags
@@ -66,5 +65,5 @@ unspecified_iterator_type retag(Iterator iter);
  */
 
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/reverse_iterator.h b/thrust/iterator/reverse_iterator.h
index 7509d860a..fe8bbe0cf 100644
--- a/thrust/iterator/reverse_iterator.h
+++ b/thrust/iterator/reverse_iterator.h
@@ -37,8 +37,7 @@
 #include <thrust/iterator/detail/reverse_iterator_base.h>
 #include <thrust/iterator/iterator_facade.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -160,7 +159,7 @@ template<typename BidirectionalIterator>
     /*! Default constructor does nothing.
      */
     __host__ __device__
-    reverse_iterator(void) {}
+    reverse_iterator() {}
 
     /*! \p Constructor accepts a \c BidirectionalIterator pointing to a range
      *  for this \p reverse_iterator to reverse.
@@ -180,14 +179,14 @@ template<typename BidirectionalIterator>
     reverse_iterator(reverse_iterator<OtherBidirectionalIterator> const &r
 // XXX msvc screws this up
 // XXX remove these guards when we have static_assert
-#ifndef _MSC_VER
+#if THRUST_HOST_COMPILER != THRUST_HOST_COMPILER_MSVC
                      , typename thrust::detail::enable_if<
                          thrust::detail::is_convertible<
                            OtherBidirectionalIterator,
                            BidirectionalIterator
                          >::value
                        >::type * = 0
-#endif // _MSC_VER
+#endif // MSVC
                      );
 
   /*! \cond
@@ -195,13 +194,13 @@ template<typename BidirectionalIterator>
   private:
     __thrust_exec_check_disable__
     __host__ __device__
-    typename super_t::reference dereference(void) const;
+    typename super_t::reference dereference() const;
 
     __host__ __device__
-    void increment(void);
+    void increment();
 
     __host__ __device__
-    void decrement(void);
+    void decrement();
 
     __host__ __device__
     void advance(typename super_t::difference_type n);
@@ -232,7 +231,7 @@ reverse_iterator<BidirectionalIterator> make_reverse_iterator(BidirectionalItera
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/reverse_iterator.inl>
 
diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h
new file mode 100644
index 000000000..a5f725dc5
--- /dev/null
+++ b/thrust/iterator/transform_input_output_iterator.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright 2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/transform_input_output_iterator.h
+ *  \brief An iterator which adapts another iterator by applying transform
+ *         functions when reading and writing dereferenced values.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/transform_input_output_iterator.inl>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_input_output_iterator is a special kind of iterator which applies
+ * transform functions when reading from or writing to dereferenced values.
+ * This iterator is useful for algorithms that operate on a type that needs to
+ * be serialized/deserialized from values in another iterator, avoiding the
+ * need to materialize intermediate results in memory. This also enables the
+ * transform functions to be fused with the operations that read and write to
+ * the `transform_input_output_iterator`.
+ *
+ * The following code snippet demonstrates how to create a
+ * \p transform_input_output_iterator which performs different transformations when
+ * reading from and writing to the iterator.
+ *
+ * \code
+ * #include <thrust/iterator/transform_input_output_iterator.h>
+ * #include <thrust/device_vector.h>
+ *
+ *  int main()
+ *  {
+ *    const size_t size = 4;
+ *    thrust::device_vector<float> v(size);
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to vector
+ *    thrust::sequence(v.begin(), v.end(), 1);
+ *
+ *    // Iterator that returns negated values and writes squared values
+ *    auto iter = thrust::make_transform_input_output_iterator(v.begin(),
+ *        thrust::negate<float>{}, thrust::square<float>{});
+ * 
+ *    // Iterator negates values when reading
+ *    std::cout << iter[0] << " ";  // -1.0f;
+ *    std::cout << iter[1] << " ";  // -2.0f;
+ *    std::cout << iter[2] << " ";  // -3.0f;
+ *    std::cout << iter[3] << "\n"; // -4.0f;
+ *
+ *    // Write 1.0f, 2.0f, 3.0f, 4.0f to iterator
+ *    thrust::sequence(iter, iter + size, 1);
+ *
+ *    // Values were squared before writing to vector
+ *    std::cout << v[0] << " ";  // 1.0f;
+ *    std::cout << v[1] << " ";  // 4.0f;
+ *    std::cout << v[2] << " ";  // 9.0f;
+ *    std::cout << v[3] << "\n"; // 16.0f;
+ *
+ *  }
+ * \endcode
+ *
+ * \see make_transform_input_output_iterator
+ */
+
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+  class transform_input_output_iterator
+    : public detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+{
+
+  /*! \cond
+   */
+
+  public:
+
+    typedef typename
+    detail::transform_input_output_iterator_base<InputFunction, OutputFunction, Iterator>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  transform_input_output_iterator() = default;
+
+  /*! This constructor takes as argument a \c Iterator an \c InputFunction and an
+   * \c OutputFunction and copies them to a new \p transform_input_output_iterator
+   *
+   * \param io An \c Iterator pointing to where the input to \c InputFunction
+   *           will be read from and the result of \c OutputFunction will be written to
+   * \param input_function An \c InputFunction to be executed on values read from the iterator
+   * \param output_function An \c OutputFunction to be executed on values written to the iterator
+   */
+    __host__ __device__
+    transform_input_output_iterator(Iterator const& io, InputFunction input_function, OutputFunction output_function)
+      : super_t(io), input_function(input_function), output_function(output_function)
+    {
+    }
+
+    /*! \cond
+     */
+  private:
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return detail::transform_input_output_iterator_proxy<
+        InputFunction, OutputFunction, Iterator
+      >(this->base_reference(), input_function, output_function);
+    }
+
+    InputFunction input_function;
+    OutputFunction output_function;
+
+    /*! \endcond
+     */
+}; // end transform_input_output_iterator
+
+/*! \p make_transform_input_output_iterator creates a \p transform_input_output_iterator from
+ *  an \c Iterator a \c InputFunction and a \c OutputFunction
+ *
+ * \param io An \c Iterator pointing to where the input to \c InputFunction
+ *           will be read from and the result of \c OutputFunction will be written to
+ * \param input_function An \c InputFunction to be executed on values read from the iterator
+ * \param output_function An \c OutputFunction to be executed on values written to the iterator
+ *  \see transform_input_output_iterator
+ */
+template <typename InputFunction, typename OutputFunction, typename Iterator>
+transform_input_output_iterator<InputFunction, OutputFunction, Iterator>
+__host__ __device__
+make_transform_input_output_iterator(Iterator io, InputFunction input_function, OutputFunction output_function)
+{
+    return transform_input_output_iterator<InputFunction, OutputFunction, Iterator>(io, input_function, output_function);
+} // end make_transform_input_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/transform_iterator.h b/thrust/iterator/transform_iterator.h
index bac004845..5afb5f37b 100644
--- a/thrust/iterator/transform_iterator.h
+++ b/thrust/iterator/transform_iterator.h
@@ -16,14 +16,14 @@
 
 
 /*! \file thrust/iterator/transform_iterator.h
- *  \brief An iterator which adapts another iterator by applying a function to the result of its dereference 
+ *  \brief An iterator which adapts another iterator by applying a function to the result of its dereference
  */
 
 /*
  * (C) Copyright David Abrahams 2002.
  * (C) Copyright Jeremy Siek    2002.
  * (C) Copyright Thomas Witt    2002.
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -40,8 +40,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -53,7 +52,7 @@ namespace thrust
  */
 
 /*! \p transform_iterator is an iterator which represents a pointer into a range
- *  of values after transformation by a function. This iterator is useful for 
+ *  of values after transformation by a function. This iterator is useful for
  *  creating a range filled with the result of applying an operation to another range
  *  without either explicitly storing it in memory, or explicitly executing the transformation.
  *  Using \p transform_iterator facilitates kernel fusion by deferring the execution
@@ -66,7 +65,7 @@ namespace thrust
  *  \code
  *  #include <thrust/iterator/transform_iterator.h>
  *  #include <thrust/device_vector.h>
- *  
+ *
  *  // note: functor inherits from unary_function
  *  struct square_root : public thrust::unary_function<float,float>
  *  {
@@ -76,25 +75,25 @@ namespace thrust
  *      return sqrtf(x);
  *    }
  *  };
- *  
- *  int main(void)
+ *
+ *  int main()
  *  {
  *    thrust::device_vector<float> v(4);
  *    v[0] = 1.0f;
  *    v[1] = 4.0f;
  *    v[2] = 9.0f;
  *    v[3] = 16.0f;
- *                                                                                           
+ *
  *    typedef thrust::device_vector<float>::iterator FloatIterator;
- *                                                                                           
+ *
  *    thrust::transform_iterator<square_root, FloatIterator> iter(v.begin(), square_root());
- *                                                                                           
+ *
  *    *iter;   // returns 1.0f
  *    iter[0]; // returns 1.0f;
  *    iter[1]; // returns 2.0f;
  *    iter[2]; // returns 3.0f;
  *    iter[3]; // returns 4.0f;
- *                                                                                           
+ *
  *    // iter[4] is an out-of-bounds error
  *  }
  *  \endcode
@@ -109,7 +108,7 @@ namespace thrust
  *  #include <thrust/device_vector.h>
  *  #include <thrust/reduce.h>
  *  #include <iostream>
- *  
+ *
  *  // note: functor inherits from unary_function
  *  struct square : public thrust::unary_function<float,float>
  *  {
@@ -119,8 +118,8 @@ namespace thrust
  *      return x * x;
  *    }
  *  };
- *  
- *  int main(void)
+ *
+ *  int main()
  *  {
  *    // initialize a device array
  *    thrust::device_vector<float> v(4);
@@ -128,29 +127,29 @@ namespace thrust
  *    v[1] = 2.0f;
  *    v[2] = 3.0f;
  *    v[3] = 4.0f;
- *  
+ *
  *    float sum_of_squares =
  *     thrust::reduce(thrust::make_transform_iterator(v.begin(), square()),
  *                    thrust::make_transform_iterator(v.end(),   square()));
- *  
+ *
  *    std::cout << "sum of squares: " << sum_of_squares << std::endl;
  *    return 0;
  *  }
  *  \endcode
  *
- *  Note that in the previous two examples the transform functor (namely \c square_root 
- *  and \c square) inherits from \c thrust::unary_function.  Inheriting from 
+ *  Note that in the previous two examples the transform functor (namely \c square_root
+ *  and \c square) inherits from \c thrust::unary_function.  Inheriting from
  *  \c thrust::unary_function ensures that a functor is a valid \c AdaptableUnaryFunction
  *  and provides all the necessary \c typedef declarations.  The \p transform_iterator
- *  can also be applied to a \c UnaryFunction that does not inherit from 
+ *  can also be applied to a \c UnaryFunction that does not inherit from
  *  \c thrust::unary_function using an optional template argument.  The following example
  *  illustrates how to use the third template argument to specify the \c result_type of
- *  the function.   
+ *  the function.
  *
  *  \code
  *  #include <thrust/iterator/transform_iterator.h>
  *  #include <thrust/device_vector.h>
- *  
+ *
  *  // note: functor *does not* inherit from unary_function
  *  struct square_root
  *  {
@@ -160,26 +159,26 @@ namespace thrust
  *      return sqrtf(x);
  *    }
  *  };
- *  
- *  int main(void)
+ *
+ *  int main()
  *  {
  *    thrust::device_vector<float> v(4);
  *    v[0] = 1.0f;
  *    v[1] = 4.0f;
  *    v[2] = 9.0f;
  *    v[3] = 16.0f;
- *                                                                                           
+ *
  *    typedef thrust::device_vector<float>::iterator FloatIterator;
- *    
+ *
  *    // note: float result_type is specified explicitly
  *    thrust::transform_iterator<square_root, FloatIterator, float> iter(v.begin(), square_root());
- *                                                                                           
+ *
  *    *iter;   // returns 1.0f
  *    iter[0]; // returns 1.0f;
  *    iter[1]; // returns 2.0f;
  *    iter[2]; // returns 3.0f;
  *    iter[3]; // returns 4.0f;
- *                                                                                           
+ *
  *    // iter[4] is an out-of-bounds error
  *  }
  *  \endcode
@@ -206,7 +205,11 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
      */
     __host__ __device__
     transform_iterator() {}
-  
+
+#if THRUST_CPP_DIALECT >= 2011
+    transform_iterator(transform_iterator const&) = default;
+#endif
+
     /*! This constructor takes as arguments an \c Iterator and an \c AdaptableUnaryFunction
      *  and copies them to a new \p transform_iterator.
      *
@@ -217,7 +220,7 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
     transform_iterator(Iterator const& x, AdaptableUnaryFunction f)
       : super_t(x), m_f(f) {
     }
-  
+
     /*! This explicit constructor copies the value of a given \c Iterator and creates
      *  this \p transform_iterator's \c AdaptableUnaryFunction using its null constructor.
      *
@@ -296,16 +299,24 @@ template <class AdaptableUnaryFunction, class Iterator, class Reference = use_de
       return *this;
     }
 
+    // MSVC 2013 and 2015 incorrectly warning about returning a reference to
+    // a local/temporary here.
+    // See goo.gl/LELTNp
+    THRUST_DISABLE_MSVC_WARNING_BEGIN(4172)
+
     __thrust_exec_check_disable__
     __host__ __device__
     typename super_t::reference dereference() const
-    { 
-      // create a temporary to allow iterators with wrapped references to convert to their value type before calling m_f
-      // note that this disallows non-constant operations through m_f
-      typename thrust::iterator_value<Iterator>::type x = *this->base();
+    {
+      // Create a temporary to allow iterators with wrapped references to
+      // convert to their value type before calling m_f. Note that this
+      // disallows non-constant operations through m_f.
+      typename thrust::iterator_value<Iterator>::type const& x = *this->base();
       return m_f(x);
     }
 
+    THRUST_DISABLE_MSVC_WARNING_END(4172)
+
     // tag this as mutable per Dave Abrahams in this thread:
     // http://lists.boost.org/Archives/boost/2004/05/65332.php
     mutable AdaptableUnaryFunction m_f;
@@ -340,5 +351,5 @@ make_transform_iterator(Iterator it, AdaptableUnaryFunction fun)
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h
new file mode 100644
index 000000000..3ac4b8572
--- /dev/null
+++ b/thrust/iterator/transform_output_iterator.h
@@ -0,0 +1,164 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/iterator/transform_output_iterator.h
+ *  \brief An output iterator which adapts another output iterator by applying a
+ *         function to the result of its dereference before writing it.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/iterator/detail/transform_output_iterator.inl>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup iterators
+ *  \{
+ */
+
+/*! \addtogroup fancyiterator Fancy Iterators
+ *  \ingroup iterators
+ *  \{
+ */
+
+/*! \p transform_output_iterator is a special kind of output iterator which
+ * transforms a value written upon dereference. This iterator is useful
+ * for transforming an output from algorithms without explicitly storing the
+ * intermediate result in the memory and applying subsequent transformation, 
+ * thereby avoiding wasting memory capacity and bandwidth.
+ * Using \p transform_iterator facilitates kernel fusion by deferring execution
+ * of transformation until the value is written while saving both memory
+ * capacity and bandwidth.
+ *
+ * The following code snippet demonstrated how to create a
+ * \p transform_output_iterator which applies \c sqrtf to the assigning value.
+ *
+ * \code
+ * #include <thrust/iterator/transform_output_iterator.h>
+ * #include <thrust/device_vector.h>
+ *
+ * // note: functor inherits form unary function
+ *  // note: functor inherits from unary_function
+ *  struct square_root : public thrust::unary_function<float,float>
+ *  {
+ *    __host__ __device__
+ *    float operator()(float x) const
+ *    {
+ *      return sqrtf(x);
+ *    }
+ *  };
+ *  
+ *  int main()
+ *  {
+ *    thrust::device_vector<float> v(4);
+ *
+ *    typedef thrust::device_vector<float>::iterator FloatIterator;
+ *    thrust::transform_output_iterator<square_root, FloatIterator> iter(v.begin(), square_root());
+ *
+ *    iter[0] =  1.0f;    // stores sqrtf( 1.0f) 
+ *    iter[1] =  4.0f;    // stores sqrtf( 4.0f)
+ *    iter[2] =  9.0f;    // stores sqrtf( 9.0f)
+ *    iter[3] = 16.0f;    // stores sqrtf(16.0f)
+ *    // iter[4] is an out-of-bounds error
+ *                                                                                           
+ *    v[0]; // returns 1.0f;
+ *    v[1]; // returns 2.0f;
+ *    v[2]; // returns 3.0f;
+ *    v[3]; // returns 4.0f;
+ *                                                                                           
+ *  }
+ *  \endcode
+ *
+ *  \see make_transform_output_iterator
+ */
+
+template <typename UnaryFunction, typename OutputIterator>
+  class transform_output_iterator
+    : public detail::transform_output_iterator_base<UnaryFunction, OutputIterator>::type
+{
+
+  /*! \cond
+   */
+
+  public:
+
+    typedef typename
+    detail::transform_output_iterator_base<UnaryFunction, OutputIterator>::type
+    super_t;
+
+    friend class thrust::iterator_core_access;
+  /*! \endcond
+   */
+
+  transform_output_iterator() = default;
+
+  /*! This constructor takes as argument an \c OutputIterator and an \c
+   * UnaryFunction and copies them to a new \p transform_output_iterator
+   *
+   * \param out An \c OutputIterator pointing to the output range whereto the result of 
+   *            \p transform_output_iterator's \c UnaryFunction will be written.
+   * \param fun An \c UnaryFunction used to transform the objects assigned to
+   *            this \p transform_output_iterator.
+   */
+    __host__ __device__
+    transform_output_iterator(OutputIterator const& out, UnaryFunction fun) : super_t(out), fun(fun)
+    {
+    }
+
+    /*! \cond
+     */
+  private:
+
+    __host__ __device__
+    typename super_t::reference dereference() const
+    {
+      return detail::transform_output_iterator_proxy<
+        UnaryFunction, OutputIterator
+      >(this->base_reference(), fun);
+    }
+
+    UnaryFunction fun;
+
+    /*! \endcond
+     */
+}; // end transform_output_iterator
+
+/*! \p make_transform_output_iterator creates a \p transform_output_iterator from
+ *  an \c OutputIterator and \c UnaryFunction.
+ *
+ *  \param out The \c OutputIterator pointing to the output range of the newly
+ *            created \p transform_output_iterator
+ *  \param fun The \c UnaryFunction transform the object before assigning it to
+ *            \c out by the newly created \p transform_output_iterator
+ *  \see transform_output_iterator
+ */
+template <typename UnaryFunction, typename OutputIterator>
+transform_output_iterator<UnaryFunction, OutputIterator>
+__host__ __device__
+make_transform_output_iterator(OutputIterator out, UnaryFunction fun)
+{
+    return transform_output_iterator<UnaryFunction, OutputIterator>(out, fun);
+} // end make_transform_output_iterator
+
+/*! \} // end fancyiterators
+ */
+
+/*! \} // end iterators
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/iterator/zip_iterator.h b/thrust/iterator/zip_iterator.h
index 76ba5870b..c2dd5ddc4 100644
--- a/thrust/iterator/zip_iterator.h
+++ b/thrust/iterator/zip_iterator.h
@@ -36,8 +36,7 @@
 #include <thrust/iterator/iterator_facade.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup iterators
  *  \{
@@ -67,7 +66,7 @@ namespace thrust
  *  int_v[0] = 0; int_v[1] = 1; int_v[2] = 2;
  *
  *  thrust::device_vector<float> float_v(3);
- *  float_v[0] = 0.0f; float_v[1] = 1.0;f float_v[2] = 2.0f;
+ *  float_v[0] = 0.0f; float_v[1] = 1.0f; float_v[2] = 2.0f;
  *
  *  thrust::device_vector<char> char_v(3);
  *  char_v[0] = 'a'; char_v[1] = 'b'; char_v[2] = 'c';
@@ -108,7 +107,7 @@ namespace thrust
  *  #include <thrust/tuple.h>
  *  #include <thrust/device_vector.h>
  *
- *  int main(void)
+ *  int main()
  *  {
  *    thrust::device_vector<int> int_in(3), int_out(3);
  *    int_in[0] = 0;
@@ -144,7 +143,7 @@ template <typename IteratorTuple>
     /*! Null constructor does nothing.
      */
     inline __host__ __device__
-    zip_iterator(void);
+    zip_iterator();
 
     /*! This constructor creates a new \p zip_iterator from a
      *  \p tuple of iterators.
@@ -229,9 +228,23 @@ template <typename IteratorTuple>
  *
  *  \see zip_iterator
  */
-template<typename IteratorTuple>
+template<typename... Iterators>
 inline __host__ __device__
-zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t);
+zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(thrust::tuple<Iterators...> t);
+
+
+/*! \p make_zip_iterator creates a \p zip_iterator from
+ *  iterators.
+ *
+ *  \param its The iterators to copy.
+ *  \return A newly created \p zip_iterator which zips the iterators.
+ *
+ *  \see zip_iterator
+ */
+template<typename... Iterators>
+inline __host__ __device__
+zip_iterator<thrust::tuple<Iterators...>> make_zip_iterator(Iterators... its);
+
 
 /*! \} // end fancyiterators
  */
@@ -239,7 +252,7 @@ zip_iterator<IteratorTuple> make_zip_iterator(IteratorTuple t);
 /*! \} // end iterators
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/iterator/detail/zip_iterator.inl>
 
diff --git a/thrust/limits.h b/thrust/limits.h
new file mode 100644
index 000000000..52f38b1fc
--- /dev/null
+++ b/thrust/limits.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <limits>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename T>
+struct numeric_limits : std::numeric_limits<T> {};
+
+THRUST_NAMESPACE_END
diff --git a/thrust/logical.h b/thrust/logical.h
index ce2127219..5a8dbbecf 100644
--- a/thrust/logical.h
+++ b/thrust/logical.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -50,8 +48,8 @@ namespace thrust
  *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -87,8 +85,8 @@ bool all_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, In
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if all elements satisfy the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -126,8 +124,8 @@ bool all_of(InputIterator first, InputIterator last, Predicate pred);
  *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -164,8 +162,8 @@ bool any_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, In
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if any element satisfies the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -204,8 +202,8 @@ bool any_of(InputIterator first, InputIterator last, Predicate pred);
  *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -242,8 +240,8 @@ bool none_of(const thrust::detail::execution_policy_base<DerivedPolicy> &exec, I
  *  \param pred A predicate used to test range elements.
  *  \return \c true, if no element satisfies the predicate; \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \code
  *  #include <thrust/logical.h>
@@ -272,8 +270,6 @@ bool none_of(InputIterator first, InputIterator last, Predicate pred);
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/logical.inl>
-
diff --git a/thrust/memory.h b/thrust/memory.h
index 0039cadaa..819ac2513 100644
--- a/thrust/memory.h
+++ b/thrust/memory.h
@@ -18,8 +18,9 @@
  *  \brief Abstractions for Thrust's memory model.
  */
 
-#include <thrust/detail/config.h>
+#pragma once
 
+#include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/reference.h>
@@ -28,21 +29,26 @@
 #include <thrust/detail/malloc_and_free.h>
 #include <thrust/detail/temporary_buffer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
+/*! \defgroup memory_management Memory Management
+ *
+ *  All Thrust functionalities related to memory allocation and deallocation.
+ *
+ */
 
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes Memory Management Classes
- *  \ingroup memory_management
+/** \addtogroup memory_management Memory Management
  *  \{
  */
 
+// define pointer for the purpose of Doxygenating it
+// it is actually defined elsewhere
+#if 0
 /*! \p pointer stores a pointer to an object allocated in memory. Like \p device_ptr, this
  *  type ensures type safety when dispatching standard algorithms on ranges resident in memory.
  *
  *  \p pointer generalizes \p device_ptr by relaxing the backend system associated with the \p pointer.
- *  Instead of the backend system specified by \p THRUST_DEFAULT_DEVICE_BACKEND, \p pointer's
+ *  Instead of the backend system specified by \p THRUST_DEVICE_SYSTEM, \p pointer's
  *  system is given by its second template parameter, \p Tag. For the purpose of Thrust dispatch,
  *  <tt>device_ptr<Element></tt> and <tt>pointer<Element,device_system_tag></tt> are considered equivalent.
  *
@@ -68,9 +74,6 @@ namespace thrust
  *  \see reference
  *  \see raw_pointer_cast
  */
-// define pointer for the purpose of Doxygenating it
-// it is actually defined elsewhere
-#if 0
 template<typename Element, typename Tag, typename Reference = thrust::use_default, typename Derived = thrust::use_default>
   class pointer
 {
@@ -78,7 +81,7 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
     /*! The type of the raw pointer
      */
     typedef typename super_t::base_type raw_pointer;
-    
+
     /*! \p pointer's default constructor initializes its encapsulated pointer to \c 0
      */
     __host__ __device__
@@ -108,7 +111,8 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
               pointer<Element,Tag,Reference,Derived>
             >::type * = 0);
 
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
+    /*! Assignment operator allows assigning from another pointer-like object whose element type
+     *  is convertible to \c Element.
      *
      *  \param other The other pointer-like object to assign from.
      *  \return <tt>*this</tt>
@@ -133,141 +137,6 @@ template<typename Element, typename Tag, typename Reference = thrust::use_defaul
 };
 #endif
 
-/*! \p reference is a wrapped reference to an object stored in memory. \p reference generalizes
- *  \p device_reference by relaxing the type of pointer associated with the object. \p reference
- *  is the type of the result of dereferencing a tagged pointer-like object such as \p pointer, and
- *  intermediates operations on objects existing in a remote memory.
- *
- *  \tparam Element specifies the type of the referent object.
- *  \tparam Pointer specifies the type of the result of taking the address of \p reference.
- *  \tparam Derived allows the client to specify the name of the derived type when \p reference is used as
- *          a base class. This is useful to ensure that assignment to objects of the derived type return
- *          values of the derived type as a result. By default, this type is <tt>reference<Element,Pointer></tt>.
- */
-// define pointer for the purpose of Doxygenating it
-// it is actually defined elsewhere
-#if 0
-template<typename Element, typename Pointer, typename Derived = thrust::use_default>
-  class reference
-{
-  public:
-    /*! The type of this \p reference's wrapped pointers.
-     */
-    typedef Pointer                                              pointer;
-
-    /*! The \p value_type of this \p reference.
-     */
-    typedef typename thrust::detail::remove_const<Element>::type value_type;
-
-    /*! This copy constructor initializes this \p reference
-     *  to refer to an object pointed to by the given \p pointer. After
-     *  this \p reference is constructed, it shall refer to the
-     *  object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr);
-
-    /*! This copy constructor accepts a const reference to another
-     *  \p reference of related type. After this \p reference is constructed,
-     *  it shall refer to the same object as \p other.
-     *  
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherElement the element type of the other \p reference.
-     *  \tparam OtherPointer the pointer type of the other \p reference.
-     *  \tparam OtherDerived the derived type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of 
-     *  <tt>reference<const T,...></tt> from <tt>reference<T,...></tt>.
-     */
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    reference(const reference<OtherElement,OtherPointer,OtherDerived> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherElement,OtherPointer,OtherDerived>::pointer,
-                pointer
-              >::type * = 0);
-
-    /*! Copy assignment operator copy assigns from another \p reference.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>
-     */
-    __host__ __device__
-    derived_type &operator=(const reference &other);
-
-    /*! Assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>
-     *
-     *  \tparam OtherElement the element type of the other \p reference.
-     *  \tparam OtherPointer the pointer type of the other \p reference.
-     *  \tparam OtherDerived the derived type of the other \p reference.
-     */
-    template<typename OtherElement, typename OtherPointer, typename OtherDerived>
-    __host__ __device__
-    derived_type &operator=(const reference<OtherElement,OtherPointer,OtherDerived> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>static_cast<derived_type&>(*this)</tt>.
-     */
-    __host__ __device__
-    derived_type &operator=(const value_type &x);
-
-    /*! Address-of operator returns a \p pointer pointing to the object
-     *  referenced by this \p reference. It does not return the address of this
-     *  \p reference.
-     *
-     *  \return A \p pointer pointing to the referenct object.
-     */
-    __host__ __device__
-    pointer operator&() const;
-
-    /*! Conversion operator converts this \p reference to \p value_type by
-     *  returning a copy of the referent object.
-     *  
-     *  \return A copy of the referent object.
-     */
-    __host__ __device__
-    operator value_type () const;
-
-    /*! Swaps the value of the referent object with another.
-     *
-     *  \param other The other \p reference with which to swap.
-     *  \note The argument is of type \p derived_type rather than \p reference.
-     */
-    __host__ __device__
-    void swap(derived_type &other);
-
-    /*! Prefix increment operator increments the referent object.
-     *
-     *  \return <tt>static_Cast<derived_type&>(*this)</tt>.
-     *
-     *  \note Documentation for other arithmetic operators omitted for brevity.
-     */
-    derived_type &operator++();
-};
-#endif
-
-/*! \}
- */
-
-/*!
- *  \addtogroup memory_management_functions Memory Management Functions
- *  \ingroup memory_management
- *  \{
- */
-
-
-/*! \addtogroup allocation_functions
- *  \{
- */
-
-
 /*! This version of \p malloc allocates untyped uninitialized storage associated with a given system.
  *
  *  \param system The Thrust system with which to associate the storage.
@@ -277,7 +146,7 @@ template<typename Element, typename Pointer, typename Derived = thrust::use_defa
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
  *  associated with Thrust's device system.
@@ -315,7 +184,7 @@ pointer<void,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<D
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p malloc to allocate a range of memory
  *  to accomodate integers associated with Thrust's device system.
@@ -359,7 +228,7 @@ pointer<T,DerivedPolicy> malloc(const thrust::detail::execution_policy_base<Deri
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
  *
- *  \pre \p DerivedPolicy must be publically derived from <code>thrust::execution_policy<DerivedPolicy></code>.
+ *  \pre \p DerivedPolicy must be publically derived from <tt>thrust::execution_policy<DerivedPolicy></tt>.
  *
  *  The following code snippet demonstrates how to use \p get_temporary_buffer to allocate a range of memory
  *  to accomodate integers associated with Thrust's device system.
@@ -396,16 +265,6 @@ __host__ __device__
 thrust::pair<thrust::pointer<T,DerivedPolicy>, typename thrust::pointer<T,DerivedPolicy>::difference_type>
 get_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
 
-
-/*! \} allocation_functions
- */
-
-
-/*! \addtogroup deallocation_functions
- *  \{
- */
-
-
 /*! \p free deallocates the storage previously allocated by \p thrust::malloc.
  *
  *  \param system The Thrust system with which the storage is associated.
@@ -482,11 +341,7 @@ void free(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Po
  */
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
-void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer p);
-
-
-/*! \} deallocation_functions
- */
+void return_temporary_buffer(const thrust::detail::execution_policy_base<DerivedPolicy> &system, Pointer p, std::ptrdiff_t n);
 
 
 /*! \p raw_pointer_cast creates a "raw" pointer from a pointer-like type,
@@ -498,8 +353,8 @@ void return_temporary_buffer(const thrust::detail::execution_policy_base<Derived
  */
 template<typename Pointer>
 __host__ __device__
-inline typename thrust::detail::pointer_traits<Pointer>::raw_pointer
-  raw_pointer_cast(const Pointer &ptr);
+typename thrust::detail::pointer_traits<Pointer>::raw_pointer
+  raw_pointer_cast(Pointer ptr);
 
 
 /*! \p raw_reference_cast creates a "raw" reference from a wrapped reference type,
@@ -515,7 +370,7 @@ inline typename thrust::detail::pointer_traits<Pointer>::raw_pointer
  */
 template<typename T>
 __host__ __device__
-inline typename detail::raw_reference<T>::type
+typename detail::raw_reference<T>::type
   raw_reference_cast(T &ref);
 
 
@@ -532,12 +387,10 @@ inline typename detail::raw_reference<T>::type
  */
 template<typename T>
 __host__ __device__
-inline typename detail::raw_reference<const T>::type
+typename detail::raw_reference<const T>::type
   raw_reference_cast(const T &ref);
 
-
-/*! \}
+/*! \} // memory_management
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/merge.h b/thrust/merge.h
index 184141f6f..724f4c167 100644
--- a/thrust/merge.h
+++ b/thrust/merge.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup merging Merging
  *  \ingroup algorithms
@@ -55,17 +53,17 @@ namespace thrust
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -90,7 +88,7 @@ namespace thrust
  *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p set_union
  *  \see \p sort
  *  \see \p is_sorted
@@ -125,17 +123,17 @@ __host__ __device__
  *  \param result The beginning of the merged output.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -155,7 +153,7 @@ __host__ __device__
  *  // result = {1, 1, 1, 2, 3, 3, 5, 5, 7, 8, 9, 11, 13}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p set_union
  *  \see \p sort
  *  \see \p is_sorted
@@ -192,14 +190,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -226,7 +224,7 @@ template<typename InputIterator1,
  *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p sort
  *  \see \p is_sorted
  */
@@ -263,14 +261,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -292,7 +290,7 @@ __host__ __device__
  *  // result = {13, 11, 9, 8, 7, 5, 5, 3, 3, 2, 1, 1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/merge.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/merge
  *  \see \p sort
  *  \see \p is_sorted
  */
@@ -340,22 +338,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -432,22 +430,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -523,19 +521,19 @@ template<typename InputIterator1, typename InputIterator2, typename InputIterato
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -617,19 +615,19 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator1's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -674,7 +672,6 @@ template<typename InputIterator1, typename InputIterator2, typename InputIterato
 /*! \} // merging
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/merge.inl>
-
diff --git a/thrust/mismatch.h b/thrust/mismatch.h
index 413db84f5..bbdf2923a 100644
--- a/thrust/mismatch.h
+++ b/thrust/mismatch.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -57,9 +55,9 @@ namespace thrust
  *  \return The first position where the sequences differ.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -109,9 +107,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::exec
  *  \param first2 The beginning of the second sequence.
  *  \return The first position where the sequences differ.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \p InputIterator1's \c value_type is equality comparable to \p InputIterator2's \c value_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -163,9 +161,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
  *  \return The first position where the sequences differ.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -217,9 +215,9 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(const thrust::detail::exec
  *  \param pred   The binary predicate to compare elements.
  *  \return The first position where the sequences differ.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Input Iterator</a>.
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Input Iterator</a>.
  *
  *  \code
  *  #include <thrust/mismatch.h>
@@ -254,7 +252,6 @@ thrust::pair<InputIterator1, InputIterator2> mismatch(InputIterator1 first1,
 /*! \} // end searching
  */
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/mismatch.inl>
-
diff --git a/thrust/mr/allocator.h b/thrust/mr/allocator.h
new file mode 100644
index 000000000..67adbe87c
--- /dev/null
+++ b/thrust/mr/allocator.h
@@ -0,0 +1,253 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief Allocator types usable with \ref Memory Resources.
+ */
+
+#pragma once
+
+#include <limits>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/exec_check_disable.h>
+#include <thrust/detail/config/memory_resource.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <thrust/mr/validator.h>
+#include <thrust/mr/polymorphic_adaptor.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup allocators Allocators
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! An \p mr::allocator is a template that fulfills the C++ requirements for Allocators,
+ *  allowing to use the NPA-based memory resources where an Allocator is required. Unlike
+ *  memory resources, but like other allocators, \p mr::allocator is typed and bound to
+ *  allocate object of a specific type, however it can be freely rebound to other types.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam MR the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final (in C++11 and beyond).
+ */
+template<typename T, class MR>
+class allocator : private validator<MR>
+{
+public:
+    /*! The pointer to void type of this allocator. */
+    typedef typename MR::pointer void_pointer;
+
+    /*! The value type allocated by this allocator. Equivalent to \p T. */
+    typedef T value_type;
+    /*! The pointer type allocated by this allocator. Equivaled to the pointer type of \p MR rebound to \p T. */
+    typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<T>::other pointer;
+    /*! The pointer to const type. Equivalent to a pointer type of \p MR rebound to <tt>const T</tt>. */
+    typedef typename thrust::detail::pointer_traits<void_pointer>::template rebind<const T>::other const_pointer;
+    /*! The reference to the type allocated by this allocator. Supports smart references. */
+    typedef typename thrust::detail::pointer_traits<pointer>::reference reference;
+    /*! The const reference to the type allocated by this allocator. Supports smart references. */
+    typedef typename thrust::detail::pointer_traits<const_pointer>::reference const_reference;
+    /*! The size type of this allocator. Always \p std::size_t. */
+    typedef std::size_t size_type;
+    /*! The difference type between pointers allocated by this allocator. */
+    typedef typename thrust::detail::pointer_traits<pointer>::difference_type difference_type;
+
+    /*! Specifies that the allocator shall be propagated on container copy assignment. */
+    typedef detail::true_type propagate_on_container_copy_assignment;
+    /*! Specifies that the allocator shall be propagated on container move assignment. */
+    typedef detail::true_type propagate_on_container_move_assignment;
+    /*! Specifies that the allocator shall be propagated on container swap. */
+    typedef detail::true_type propagate_on_container_swap;
+
+    /*! The \p rebind metafunction provides the type of an \p allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p allocator.
+         */
+        typedef allocator<U, MR> other;
+    };
+
+    /*! Calculates the maximum number of elements allocated by this allocator.
+     *
+     *  \return the maximum value of \p std::size_t, divided by the size of \p T.
+     */
+    __thrust_exec_check_disable__
+    __host__ __device__
+    size_type max_size() const
+    {
+        return (std::numeric_limits<size_type>::max)() / sizeof(T);
+    }
+
+    /*! Constructor.
+     *
+     *  \param resource the resource to be used to allocate raw memory.
+     */
+    __host__ __device__
+    allocator(MR * resource) : mem_res(resource)
+    {
+    }
+
+    /*! Copy constructor. Copies the resource pointer. */
+    template<typename U>
+    __host__ __device__
+    allocator(const allocator<U, MR> & other) : mem_res(other.resource())
+    {
+    }
+
+    /*! Allocates objects of type \p T.
+     *
+     *  \param n number of elements to allocate
+     *  \return a pointer to the newly allocated storage.
+     */
+    THRUST_NODISCARD
+    __host__
+    pointer allocate(size_type n)
+    {
+        return static_cast<pointer>(mem_res->do_allocate(n * sizeof(T), THRUST_ALIGNOF(T)));
+    }
+
+    /*! Deallocates objects of type \p T.
+     *
+     *  \param p pointer returned by a previous call to \p allocate
+     *  \param n number of elements, passed as an argument to the \p allocate call that produced \p p
+     */
+    __host__
+    void deallocate(pointer p, size_type n)
+    {
+        return mem_res->do_deallocate(p, n * sizeof(T), THRUST_ALIGNOF(T));
+    }
+
+    /*! Extracts the memory resource used by this allocator.
+     *
+     *  \return the memory resource used by this allocator.
+     */
+    __host__ __device__
+    MR * resource() const
+    {
+        return mem_res;
+    }
+
+private:
+    MR * mem_res;
+};
+
+/*! Compares the allocators for equality by comparing the underlying memory resources. */
+template<typename T, typename MR>
+__host__ __device__
+bool operator==(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) noexcept
+{
+    return *lhs.resource() == *rhs.resource();
+}
+
+/*! Compares the allocators for inequality by comparing the underlying memory resources. */
+template<typename T, typename MR>
+__host__ __device__
+bool operator!=(const allocator<T, MR> & lhs, const allocator<T, MR> & rhs) noexcept
+{
+    return !(lhs == rhs);
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template<typename T, typename Pointer>
+using polymorphic_allocator = allocator<T, polymorphic_adaptor_resource<Pointer> >;
+
+#else // C++11
+
+template<typename T, typename Pointer>
+class polymorphic_allocator : public allocator<T, polymorphic_adaptor_resource<Pointer> >
+{
+    typedef allocator<T, polymorphic_adaptor_resource<Pointer> > base;
+
+public:
+    /*! Initializes the base class with the parameter \p resource.
+     */
+    polymorphic_allocator(polymorphic_adaptor_resource<Pointer>  * resource) : base(resource)
+    {
+    }
+};
+
+#endif // C++11
+
+/*! A helper allocator class that uses global instances of a given upstream memory resource. Requires the memory resource
+ *      to be default constructible.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam Upstream the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final (in C++11 and beyond).
+ */
+template<typename T, typename Upstream>
+class stateless_resource_allocator : public thrust::mr::allocator<T, Upstream>
+{
+    typedef thrust::mr::allocator<T, Upstream> base;
+
+public:
+    /*! The \p rebind metafunction provides the type of an \p stateless_resource_allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p stateless_resource_allocator.
+         */
+        typedef stateless_resource_allocator<U, Upstream> other;
+    };
+
+    /*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
+     *      \p allocator base subobject with that resource.
+     */
+    __thrust_exec_check_disable__
+    __host__ __device__
+    stateless_resource_allocator() : base(get_global_resource<Upstream>())
+    {
+    }
+
+    /*! Copy constructor. Copies the memory resource pointer. */
+    __host__ __device__
+    stateless_resource_allocator(const stateless_resource_allocator & other)
+        : base(other) {}
+
+    /*! Conversion constructor from an allocator of a different type. Copies the memory resource pointer. */
+    template<typename U>
+    __host__ __device__
+    stateless_resource_allocator(const stateless_resource_allocator<U, Upstream> & other)
+        : base(other) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+    stateless_resource_allocator & operator=(const stateless_resource_allocator &) = default;
+#endif
+
+    /*! Destructor. */
+    __host__ __device__
+    ~stateless_resource_allocator() {}
+};
+
+/*! \} // allocators
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/device_memory_resource.h b/thrust/mr/device_memory_resource.h
new file mode 100644
index 000000000..3a671142a
--- /dev/null
+++ b/thrust/mr/device_memory_resource.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the device system's memory_resource header
+#define __THRUST_DEVICE_SYSTEM_MEMORY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/memory_resource.h>
+#include __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::memory_resource
+    device_memory_resource;
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_memory_resource
+    universal_memory_resource;
+typedef thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_host_pinned_memory_resource
+    universal_host_pinned_memory_resource;
+
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/disjoint_pool.h b/thrust/mr/disjoint_pool.h
new file mode 100644
index 000000000..b00a8644c
--- /dev/null
+++ b/thrust/mr/disjoint_pool.h
@@ -0,0 +1,489 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief A caching and pooling memory resource adaptor which uses separate upstream resources for memory allocation
+ *      and bookkeeping.
+ */
+
+#pragma once
+
+#include <thrust/detail/algorithm_wrapper.h>
+#include <thrust/detail/config.h>
+
+#include <thrust/host_vector.h>
+#include <thrust/binary_search.h>
+#include <thrust/detail/seq.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/pool_options.h>
+
+#include <cassert>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A memory resource adaptor allowing for pooling and caching allocations from \p Upstream, using \p Bookkeeper for
+ *      management of that cached and pooled memory, allowing to cache portions of memory inaccessible from the host.
+ *
+ *  On a typical memory resource, calls to \p allocate and \p deallocate actually allocate and deallocate memory. Pooling
+ *      memory resources only allocate and deallocate memory from an external resource (the upstream memory resource) when
+ *      there's no suitable memory currently cached; otherwise, they use memory they have acquired beforehand, to make
+ *      memory allocation faster and more efficient.
+ *
+ *  The disjoint version of the pool resources uses a separate upstream memory resource, \p Bookkeeper, to allocate memory
+ *      necessary to manage the cached memory. There may be many reasons to do that; the canonical one is that \p Upstream
+ *      allocates memory that is inaccessible to the code of the pool resource, which means that it cannot embed the necessary
+ *      information in memory obtained from \p Upstream; for instance, \p Upstream can be a CUDA non-managed memory
+ *      resource, or a CUDA managed memory resource whose memory we would prefer to not migrate back and forth between
+ *      host and device when executing bookkeeping code.
+ *
+ *  This is not the only case where it makes sense to use a disjoint pool resource, though. In a multi-core environment
+ *      it may be beneficial to avoid stealing cache lines from other cores by writing over bookkeeping information
+ *      embedded in an allocated block of memory. In such a case, one can imagine wanting to use a disjoint pool where
+ *      both the upstream and the bookkeeper are of the same type, to allocate memory consistently, but separately for
+ *      those two purposes.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks to be handed off to the user
+ *  \tparam Bookkeeper the type of memory resources that will be used for allocating bookkeeping memory
+ */
+template<typename Upstream, typename Bookkeeper>
+class disjoint_unsynchronized_pool_resource final
+    : public memory_resource<typename Upstream::pointer>,
+        private validator2<Upstream, Bookkeeper>
+{
+public:
+    /*! Get the default options for a disjoint pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        pool_options ret;
+
+        ret.min_blocks_per_chunk = 16;
+        ret.min_bytes_per_chunk = 1024;
+        ret.max_blocks_per_chunk = static_cast<std::size_t>(1) << 20;
+        ret.max_bytes_per_chunk = static_cast<std::size_t>(1) << 30;
+
+        ret.smallest_block_size = THRUST_MR_DEFAULT_ALIGNMENT;
+        ret.largest_block_size = static_cast<std::size_t>(1) << 20;
+
+        ret.alignment = THRUST_MR_DEFAULT_ALIGNMENT;
+
+        ret.cache_oversized = true;
+
+        ret.cached_size_cutoff_factor = 16;
+        ret.cached_alignment_cutoff_factor = 16;
+
+        return ret;
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param bookkeeper the upstream memory resource for bookkeeping
+     *  \param options pool options to use
+     */
+    disjoint_unsynchronized_pool_resource(Upstream * upstream, Bookkeeper * bookkeeper,
+        pool_options options = get_default_options())
+        : m_upstream(upstream),
+        m_bookkeeper(bookkeeper),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(m_bookkeeper),
+        m_allocated(m_bookkeeper),
+        m_cached_oversized(m_bookkeeper),
+        m_oversized(m_bookkeeper)
+    {
+        assert(m_options.validate());
+
+        pointer_vector free(m_bookkeeper);
+        pool p(free);
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    // TODO: C++11: use delegating constructors
+
+    /*! Constructor. Upstream and bookkeeping resources are obtained by calling \p get_global_resource for their types.
+     *
+     *  \param options pool options to use
+     */
+    disjoint_unsynchronized_pool_resource(pool_options options = get_default_options())
+        : m_upstream(get_global_resource<Upstream>()),
+        m_bookkeeper(get_global_resource<Bookkeeper>()),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(m_bookkeeper),
+        m_allocated(m_bookkeeper),
+        m_cached_oversized(m_bookkeeper),
+        m_oversized(m_bookkeeper)
+    {
+        assert(m_options.validate());
+
+        pointer_vector free(m_bookkeeper);
+        pool p(free);
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    /*! Destructor. Releases all held memory to upstream.
+     */
+    ~disjoint_unsynchronized_pool_resource()
+    {
+        release();
+    }
+
+private:
+    typedef typename Upstream::pointer void_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<char>::other char_ptr;
+
+    struct chunk_descriptor
+    {
+        std::size_t size;
+        void_ptr pointer;
+    };
+
+    typedef thrust::host_vector<
+        chunk_descriptor,
+        allocator<chunk_descriptor, Bookkeeper>
+    > chunk_vector;
+
+    struct oversized_block_descriptor
+    {
+        std::size_t size;
+        std::size_t alignment;
+        void_ptr pointer;
+
+        __host__ __device__
+        bool operator==(const oversized_block_descriptor & other) const
+        {
+            return size == other.size && alignment == other.alignment && pointer == other.pointer;
+        }
+
+        __host__ __device__
+        bool operator<(const oversized_block_descriptor & other) const
+        {
+            return size < other.size || (size == other.size && alignment < other.alignment);
+        }
+    };
+
+    struct equal_pointers
+    {
+    public:
+        __host__ __device__
+        equal_pointers(void_ptr p) : p(p)
+        {
+        }
+
+        __host__ __device__
+        bool operator()(const oversized_block_descriptor & desc) const
+        {
+            return desc.pointer == p;
+        }
+
+    private:
+        void_ptr p;
+    };
+
+    struct matching_alignment
+    {
+    public:
+        __host__ __device__
+        matching_alignment(std::size_t requested) : requested(requested)
+        {
+        }
+
+        __host__ __device__
+        bool operator()(const oversized_block_descriptor & desc) const
+        {
+            return desc.alignment >= requested;
+        }
+
+    private:
+        std::size_t requested;
+    };
+
+    typedef thrust::host_vector<
+        oversized_block_descriptor,
+        allocator<oversized_block_descriptor, Bookkeeper>
+    > oversized_block_vector;
+
+    typedef thrust::host_vector<
+        void_ptr,
+        allocator<void_ptr, Bookkeeper>
+    > pointer_vector;
+
+    struct pool
+    {
+        __host__
+        pool(const pointer_vector & free)
+            : free_blocks(free),
+            previous_allocated_count(0)
+        {
+        }
+
+        __host__
+        pool(const pool & other)
+            : free_blocks(other.free_blocks),
+            previous_allocated_count(other.previous_allocated_count)
+        {
+        }
+
+#if THRUST_CPP_DIALECT >= 2011
+        pool & operator=(const pool &) = default;
+#endif
+
+        __host__
+        ~pool() {}
+
+        pointer_vector free_blocks;
+        std::size_t previous_allocated_count;
+    };
+
+    typedef thrust::host_vector<
+        pool,
+        allocator<pool, Bookkeeper>
+    > pool_vector;
+
+    Upstream * m_upstream;
+    Bookkeeper * m_bookkeeper;
+
+    pool_options m_options;
+    std::size_t m_smallest_block_log2;
+
+    // buckets containing free lists for each pooled size
+    pool_vector m_pools;
+    // list of all allocations from upstream for the above
+    chunk_vector m_allocated;
+    // list of all cached oversized/overaligned blocks that have been returned to the pool to cache
+    oversized_block_vector m_cached_oversized;
+    // list of all oversized/overaligned allocations from upstream
+    oversized_block_vector m_oversized;
+
+public:
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        // reset the buckets
+        for (std::size_t i = 0; i < m_pools.size(); ++i)
+        {
+            m_pools[i].free_blocks.clear();
+            m_pools[i].previous_allocated_count = 0;
+        }
+
+        // deallocate memory allocated for the buckets
+        for (std::size_t i = 0; i < m_allocated.size(); ++i)
+        {
+            m_upstream->do_deallocate(
+                m_allocated[i].pointer,
+                m_allocated[i].size,
+                m_options.alignment);
+        }
+
+        // deallocate cached oversized/overaligned memory
+        for (std::size_t i = 0; i < m_oversized.size(); ++i)
+        {
+            m_upstream->do_deallocate(
+                m_oversized[i].pointer,
+                m_oversized[i].size,
+                m_oversized[i].alignment);
+        }
+
+        m_allocated.clear();
+        m_oversized.clear();
+        m_cached_oversized.clear();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        bytes = (std::max)(bytes, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // an oversized and/or overaligned allocation requested; needs to be allocated separately
+        if (bytes > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            oversized_block_descriptor oversized;
+            oversized.size = bytes;
+            oversized.alignment = alignment;
+
+            if (m_options.cache_oversized && !m_cached_oversized.empty())
+            {
+                typename oversized_block_vector::iterator it = thrust::lower_bound(
+                    thrust::seq,
+                    m_cached_oversized.begin(),
+                    m_cached_oversized.end(),
+                    oversized);
+
+                // if the size is bigger than the requested size by a factor
+                // bigger than or equal to the specified cutoff for size,
+                // allocate a new block
+                if (it != m_cached_oversized.end())
+                {
+                    std::size_t size_factor = (*it).size / bytes;
+                    if (size_factor >= m_options.cached_size_cutoff_factor)
+                    {
+                        it = m_cached_oversized.end();
+                    }
+                }
+
+                if (it != m_cached_oversized.end() && (*it).alignment < alignment)
+                {
+                    it = find_if(it + 1, m_cached_oversized.end(), matching_alignment(alignment));
+                }
+
+                // if the alignment is bigger than the requested one by a factor
+                // bigger than or equal to the specified cutoff for alignment,
+                // allocate a new block
+                if (it != m_cached_oversized.end())
+                {
+                    std::size_t alignment_factor = (*it).alignment / alignment;
+                    if (alignment_factor >= m_options.cached_alignment_cutoff_factor)
+                    {
+                        it = m_cached_oversized.end();
+                    }
+                }
+
+                if (it != m_cached_oversized.end())
+                {
+                    oversized.pointer = (*it).pointer;
+                    m_cached_oversized.erase(it);
+                    return oversized.pointer;
+                }
+            }
+
+            // no fitting cached block found; allocate a new one that's just up to the specs
+            oversized.pointer = m_upstream->do_allocate(bytes, alignment);
+            m_oversized.push_back(oversized);
+
+            return oversized.pointer;
+        }
+
+        // the request is NOT for oversized and/or overaligned memory
+        // allocate a block from an appropriate bucket
+        std::size_t bytes_log2 = thrust::detail::log2_ri(bytes);
+        std::size_t bucket_idx = bytes_log2 - m_smallest_block_log2;
+        pool & bucket = m_pools[bucket_idx];
+
+        // if the free list of the bucket has no elements, allocate a new chunk
+        // and split it into blocks pushed to the free list
+        if (bucket.free_blocks.empty())
+        {
+            std::size_t bucket_size = static_cast<std::size_t>(1) << bytes_log2;
+
+            std::size_t n = bucket.previous_allocated_count;
+            if (n == 0)
+            {
+                n = m_options.min_blocks_per_chunk;
+                if (n < (m_options.min_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.min_bytes_per_chunk >> bytes_log2;
+                }
+            }
+            else
+            {
+                n = n * 3 / 2;
+                if (n > (m_options.max_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.max_bytes_per_chunk >> bytes_log2;
+                }
+                if (n > m_options.max_blocks_per_chunk)
+                {
+                    n = m_options.max_blocks_per_chunk;
+                }
+            }
+
+            bytes = n << bytes_log2;
+
+            assert(n >= m_options.min_blocks_per_chunk);
+            assert(n <= m_options.max_blocks_per_chunk);
+            assert(bytes >= m_options.min_bytes_per_chunk);
+            assert(bytes <= m_options.max_bytes_per_chunk);
+
+            chunk_descriptor allocated;
+            allocated.size = bytes;
+            allocated.pointer = m_upstream->do_allocate(bytes, m_options.alignment);
+            m_allocated.push_back(allocated);
+            bucket.previous_allocated_count = n;
+
+            for (std::size_t i = 0; i < n; ++i)
+            {
+                bucket.free_blocks.push_back(
+                    static_cast<void_ptr>(
+                        static_cast<char_ptr>(allocated.pointer) + i * bucket_size
+                    )
+                );
+            }
+        }
+
+        // allocate a block from the front of the bucket's free list
+        void_ptr ret = bucket.free_blocks.back();
+        bucket.free_blocks.pop_back();
+        return ret;
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        n = (std::max)(n, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // verify that the pointer is at least as aligned as claimed
+        assert(reinterpret_cast<detail::intmax_t>(detail::pointer_traits<void_ptr>::get(p)) % alignment == 0);
+
+        // the deallocated block is oversized and/or overaligned
+        if (n > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            typename oversized_block_vector::iterator it = find_if(m_oversized.begin(), m_oversized.end(), equal_pointers(p));
+            assert(it != m_oversized.end());
+
+            oversized_block_descriptor oversized = *it;
+
+            if (m_options.cache_oversized)
+            {
+                typename oversized_block_vector::iterator position = lower_bound(m_cached_oversized.begin(), m_cached_oversized.end(), oversized);
+                m_cached_oversized.insert(position, oversized);
+                return;
+            }
+
+            m_oversized.erase(it);
+
+            m_upstream->do_deallocate(p, oversized.size, oversized.alignment);
+
+            return;
+        }
+
+        // push the block to the front of the appropriate bucket's free list
+        std::size_t n_log2 = thrust::detail::log2_ri(n);
+        std::size_t bucket_idx = n_log2 - m_smallest_block_log2;
+        pool & bucket = m_pools[bucket_idx];
+
+        bucket.free_blocks.push_back(p);
+    }
+};
+
+/*! \} // memory_resource
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/disjoint_sync_pool.h b/thrust/mr/disjoint_sync_pool.h
new file mode 100644
index 000000000..ed81ae4cb
--- /dev/null
+++ b/thrust/mr/disjoint_sync_pool.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief A mutex-synchronized version of \p disjoint_unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <mutex>
+
+#include <thrust/mr/disjoint_pool.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A mutex-synchronized version of \p disjoint_unsynchronized_pool_resource. Uses \p std::mutex, and therefore requires C++11.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks to be handed off to the user
+ *  \tparam Bookkeeper the type of memory resources that will be used for allocating bookkeeping memory
+ */
+template<typename Upstream, typename Bookkeeper>
+struct disjoint_synchronized_pool_resource : public memory_resource<typename Upstream::pointer>
+{
+    typedef disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> unsync_pool;
+    typedef std::lock_guard<std::mutex> lock_t;
+
+    typedef typename Upstream::pointer void_ptr;
+
+public:
+    /*! Get the default options for a disjoint pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        return unsync_pool::get_default_options();
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param bookkeeper the upstream memory resource for bookkeeping
+     *  \param options pool options to use
+     */
+    disjoint_synchronized_pool_resource(Upstream * upstream, Bookkeeper * bookkeeper,
+        pool_options options = get_default_options())
+        : upstream_pool(upstream, bookkeeper, options)
+    {
+    }
+
+    /*! Constructor. Upstream and bookkeeping resources are obtained by calling \p get_global_resource for their types.
+     *
+     *  \param options pool options to use
+     */
+    disjoint_synchronized_pool_resource(pool_options options = get_default_options())
+        : upstream_pool(get_global_resource<Upstream>(), get_global_resource<Bookkeeper>(), options)
+    {
+    }
+
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        lock_t lock(mtx);
+        upstream_pool.release();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        lock_t lock(mtx);
+        return upstream_pool.do_allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        lock_t lock(mtx);
+        upstream_pool.do_deallocate(p, n, alignment);
+    }
+
+private:
+    std::mutex mtx;
+    unsync_pool upstream_pool;
+};
+
+/*! \} // memory_resources
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/mr/disjoint_tls_pool.h b/thrust/mr/disjoint_tls_pool.h
new file mode 100644
index 000000000..9fc7917ca
--- /dev/null
+++ b/thrust/mr/disjoint_tls_pool.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file disjoint_tls_pool.h
+ *  \brief A function wrapping a thread local instance of a \p disjoint_unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/mr/disjoint_pool.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! Potentially constructs, if not yet created, and then returns the address of a thread-local
+ *      \p disjoint_unsynchronized_pool_resource,
+ *
+ *  \tparam Upstream the first template argument to the pool template
+ *  \tparam Bookkeeper the second template argument to the pool template
+ *  \param upstream the first argument to the constructor, if invoked
+ *  \param bookkeeper the second argument to the constructor, if invoked
+ */
+template<typename Upstream, typename Bookkeeper>
+__host__
+thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper> & tls_disjoint_pool(
+    Upstream * upstream = NULL,
+    Bookkeeper * bookkeeper = NULL)
+{
+    static thread_local auto adaptor = [&]{
+        assert(upstream && bookkeeper);
+        return thrust::mr::disjoint_unsynchronized_pool_resource<Upstream, Bookkeeper>(upstream, bookkeeper);
+    }();
+
+    return adaptor;
+}
+
+/*! \}
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/mr/fancy_pointer_resource.h b/thrust/mr/fancy_pointer_resource.h
new file mode 100644
index 000000000..b88107564
--- /dev/null
+++ b/thrust/mr/fancy_pointer_resource.h
@@ -0,0 +1,61 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/validator.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+template<typename Upstream, typename Pointer>
+class fancy_pointer_resource final : public memory_resource<Pointer>, private validator<Upstream>
+{
+public:
+    fancy_pointer_resource() : m_upstream(get_global_resource<Upstream>())
+    {
+    }
+
+    fancy_pointer_resource(Upstream * upstream) : m_upstream(upstream)
+    {
+    }
+
+    THRUST_NODISCARD
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        return static_cast<Pointer>(m_upstream->do_allocate(bytes, alignment));
+    }
+
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
+    {
+        return m_upstream->do_deallocate(
+            static_cast<typename Upstream::pointer>(
+                thrust::detail::pointer_traits<Pointer>::get(p)),
+            bytes, alignment);
+    }
+
+private:
+    Upstream * m_upstream;
+};
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cuda/detail/throw_on_error.h b/thrust/mr/host_memory_resource.h
similarity index 60%
rename from thrust/system/cuda/detail/throw_on_error.h
rename to thrust/mr/host_memory_resource.h
index 9d5f509d0..9359a97a7 100644
--- a/thrust/system/cuda/detail/throw_on_error.h
+++ b/thrust/mr/host_memory_resource.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2012 NVIDIA Corporation
+ *  Copyright 2018-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,29 +17,16 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <cstdio>
 
+// #include the host system's memory_resource header
+#define __THRUST_HOST_SYSTEM_MEMORY_HEADER <__THRUST_HOST_SYSTEM_ROOT/memory_resource.h>
+#include __THRUST_HOST_SYSTEM_MEMORY_HEADER
+#undef __THRUST_HOST_SYSTEM_MEMORY_HEADER
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+THRUST_NAMESPACE_BEGIN
 
+typedef thrust::system::__THRUST_HOST_SYSTEM_NAMESPACE::memory_resource
+    host_memory_resource;
 
-inline __host__ __device__
-void throw_on_error(cudaError_t error, const char *message)
-{
-  thrust::system::cuda::detail::bulk_::detail::throw_on_error(error, message);
-}
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/mr/memory_resource.h b/thrust/mr/memory_resource.h
new file mode 100644
index 000000000..6af2f167c
--- /dev/null
+++ b/thrust/mr/memory_resource.h
@@ -0,0 +1,217 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief A base class for the memory resource system, similar to
+ *  std::memory_resource, and related utilities.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/memory_resource.h>
+#ifdef THRUST_MR_STD_MR_HEADER
+#  include THRUST_MR_STD_MR_HEADER
+#endif
+
+THRUST_NAMESPACE_BEGIN
+/*! \brief \p thrust::mr is the namespace containing system agnostic types and functions for \p memory_resource related functionalities.
+ */
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! \p memory_resource is the base class for all other memory resources.
+ *
+ *  \tparam Pointer the pointer type that is allocated and deallocated by the memory resource
+ *      derived from this base class. If this is <tt>void *</tt>, this class derives from
+ *      <tt>std::pmr::memory_resource</tt>.
+ */
+template<typename Pointer = void *>
+class memory_resource
+{
+public:
+    /*! Alias for the template parameter.
+     */
+    typedef Pointer pointer;
+
+    /*! Virtual destructor, defaulted when possible.
+     */
+    virtual ~memory_resource() = default;
+
+    /*! Allocates memory of size at least \p bytes and alignment at least \p alignment.
+     *
+     *  \param bytes size, in bytes, that is requested from this allocation
+     *  \param alignment alignment that is requested from this allocation
+     *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
+     *  \return A pointer to void to the newly allocated memory.
+     */
+    THRUST_NODISCARD
+    pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        return do_allocate(bytes, alignment);
+    }
+
+    /*! Deallocates memory pointed to by \p p.
+     *
+     *  \param p pointer to be deallocated
+     *  \param bytes the size of the allocation. This must be equivalent to the value of \p bytes that
+     *      was passed to the allocation function that returned \p p.
+     *  \param alignment the alignment of the allocation. This must be equivalent to the value of \p alignment
+     *      that was passed to the allocation function that returned \p p.
+     */
+    void deallocate(pointer p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        do_deallocate(p, bytes, alignment);
+    }
+
+    /*! Compares this resource to the other one. The default implementation uses identity comparison,
+     *      which is often the right thing to do and doesn't require RTTI involvement.
+     *
+     *  \param other the other resource to compare this resource to
+     *  \return whether the two resources are equivalent.
+     */
+    __host__ __device__
+    bool is_equal(const memory_resource & other) const noexcept
+    {
+        return do_is_equal(other);
+    }
+
+    /*! Allocates memory of size at least \p bytes and alignment at least \p alignment.
+     *
+     *  \param bytes size, in bytes, that is requested from this allocation
+     *  \param alignment alignment that is requested from this allocation
+     *  \throws thrust::bad_alloc when no memory with requested size and alignment can be allocated.
+     *  \return A pointer to void to the newly allocated memory.
+     */
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
+
+    /*! Deallocates memory pointed to by \p p.
+     *
+     *  \param p pointer to be deallocated
+     *  \param bytes the size of the allocation. This must be equivalent to the value of \p bytes that
+     *      was passed to the allocation function that returned \p p.
+     *  \param alignment the size of the allocation. This must be equivalent to the value of \p alignment
+     *      that was passed to the allocation function that returned \p p.
+     */
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) = 0;
+
+    /*! Compares this resource to the other one. The default implementation uses identity comparison,
+     *      which is often the right thing to do and doesn't require RTTI involvement.
+     *
+     *  \param other the other resource to compare this resource to
+     *  \return whether the two resources are equivalent.
+     */
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource & other) const noexcept
+    {
+        return this == &other;
+    }
+};
+
+template<>
+class memory_resource<void *>
+#ifdef THRUST_STD_MR_NS
+    : THRUST_STD_MR_NS::memory_resource
+#endif
+{
+public:
+    typedef void * pointer;
+
+    virtual ~memory_resource() = default;
+
+    THRUST_NODISCARD
+    pointer allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        return do_allocate(bytes, alignment);
+    }
+
+    void deallocate(pointer p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT)
+    {
+        do_deallocate(p, bytes, alignment);
+    }
+
+    __host__ __device__
+    bool is_equal(const memory_resource & other) const noexcept
+    {
+        return do_is_equal(other);
+    }
+
+    virtual pointer do_allocate(std::size_t bytes, std::size_t alignment) = 0;
+    virtual void do_deallocate(pointer p, std::size_t bytes, std::size_t alignment) = 0;
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource & other) const noexcept
+    {
+        return this == &other;
+    }
+
+#ifdef THRUST_STD_MR_NS
+    // the above do_is_equal is a different function than the one from the standard memory resource
+    // can't implement this reasonably without RTTI though; it's reasonable to assume false otherwise
+
+    virtual bool do_is_equal(const THRUST_STD_MR_NS::memory_resource & other) const noexcept override
+    {
+#  ifdef THRUST_HAS_DYNAMIC_CAST
+        auto mr_resource = dynamic_cast<memory_resource<> *>(&other);
+        return mr_resource && do_is_equal(*mr_resource);
+#  else
+        return this == &other;
+#  endif
+    }
+#endif
+};
+
+/*! Compares the memory resources for equality, first by identity, then by \p is_equal.
+ */
+template<typename Pointer>
+__host__ __device__
+bool operator==(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) noexcept
+{
+    return &lhs == &rhs || rhs.is_equal(rhs);
+}
+
+/*! Compares the memory resources for inequality, first by identity, then by \p is_equal.
+ */
+template<typename Pointer>
+__host__ __device__
+bool operator!=(const memory_resource<Pointer> & lhs, const memory_resource<Pointer> & rhs) noexcept
+{
+    return !(lhs == rhs);
+}
+
+/*! Returns a global instance of \p MR, created as a function local static variable.
+ *
+ *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
+ *  \return a pointer to a global instance of \p MR.
+ */
+template<typename MR>
+__host__
+MR * get_global_resource()
+{
+    static MR resource;
+    return &resource;
+}
+
+/*! \} // memory_resource
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/new.h b/thrust/mr/new.h
new file mode 100644
index 000000000..644e25169
--- /dev/null
+++ b/thrust/mr/new.h
@@ -0,0 +1,89 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Global operator new-based memory resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/mr/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A memory resource that uses global operators new and delete to allocate and deallocate memory. Uses alignment-enabled
+ *      overloads when available, otherwise uses regular overloads and implements alignment requirements by itself.
+ */
+class new_delete_resource final : public memory_resource<>
+{
+public:
+    void * do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+#if defined(__cpp_aligned_new)
+        return ::operator new(bytes, std::align_val_t(alignment));
+#else
+        // allocate memory for bytes, plus potential alignment correction,
+        // plus store of the correction offset
+        void * p = ::operator new(bytes + alignment + sizeof(std::size_t));
+        std::size_t ptr_int = reinterpret_cast<std::size_t>(p);
+        // calculate the offset, i.e. how many bytes of correction was necessary
+        // to get an aligned pointer
+        std::size_t offset = (ptr_int % alignment) ? (alignment - ptr_int % alignment) : 0;
+        // calculate the return pointer
+        char * ptr = static_cast<char *>(p) + offset;
+        // store the offset right after the actually returned value
+        std::size_t * offset_store = reinterpret_cast<std::size_t *>(ptr + bytes);
+        *offset_store = offset;
+        return static_cast<void *>(ptr);
+#endif
+    }
+
+    void do_deallocate(void * p, std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+#if defined(__cpp_aligned_new)
+# if defined(__cpp_sized_deallocation)
+        ::operator delete(p, bytes, std::align_val_t(alignment));
+# else
+        (void)bytes;
+        ::operator delete(p, std::align_val_t(alignment));
+# endif
+#else
+        (void)alignment;
+        char * ptr = static_cast<char *>(p);
+        // calculate where the offset is stored
+        std::size_t * offset = reinterpret_cast<std::size_t *>(ptr + bytes);
+        // calculate the original pointer
+        p = static_cast<void *>(ptr - *offset);
+        ::operator delete(p);
+#endif
+    }
+};
+
+/*! \} // memory_resources
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/polymorphic_adaptor.h b/thrust/mr/polymorphic_adaptor.h
new file mode 100644
index 000000000..0562a8f82
--- /dev/null
+++ b/thrust/mr/polymorphic_adaptor.h
@@ -0,0 +1,57 @@
+/*
+ *  Copyright 2018-2019 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/mr/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+template<typename Pointer = void *>
+class polymorphic_adaptor_resource final : public memory_resource<Pointer>
+{
+public:
+    polymorphic_adaptor_resource(memory_resource<Pointer> * t) : upstream_resource(t)
+    {
+    }
+
+    virtual Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        return upstream_resource->allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
+    {
+        return upstream_resource->deallocate(p, bytes, alignment);
+    }
+
+    __host__ __device__
+    virtual bool do_is_equal(const memory_resource<Pointer> & other) const noexcept override
+    {
+        return upstream_resource->is_equal(other);
+    }
+
+private:
+    memory_resource<Pointer> * upstream_resource;
+};
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/pool.h b/thrust/mr/pool.h
new file mode 100644
index 000000000..6259a23f1
--- /dev/null
+++ b/thrust/mr/pool.h
@@ -0,0 +1,507 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief A caching and pooling memory resource adaptor which uses a single
+ *  upstream resource for memory allocation, and embeds bookkeeping information
+ *  in allocated blocks.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/algorithm_wrapper.h>
+
+#include <thrust/host_vector.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/pool_options.h>
+
+#include <cassert>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A memory resource adaptor allowing for pooling and caching allocations from \p Upstream, using memory allocated
+ *      from it for both blocks then allocated to the user and for internal bookkeeping of the cached memory.
+ *
+ *  On a typical memory resource, calls to \p allocate and \p deallocate actually allocate and deallocate memory. Pooling
+ *      memory resources only allocate and deallocate memory from an external resource (the upstream memory resource) when
+ *      there's no suitable memory currently cached; otherwise, they use memory they have acquired beforehand, to make
+ *      memory allocation faster and more efficient.
+ *
+ *  The non-disjoint version of the pool resource uses a single upstream memory resource. Every allocation is larger than
+ *      strictly necessary to fulfill the end-user's request, because it needs to account for the memory overhead of tracking
+ *      the memory blocks and chunks inside those same memory regions. Nevertheless, this version should be more memory-efficient
+ *      than the \p disjoint_unsynchronized_pool_resource, because it doesn't need to allocate additional blocks of memory
+ *      from a separate resource, which in turn would necessitate the bookkeeping overhead in the upstream resource.
+ *
+ *  This version requires that memory allocated from Upstream is accessible from device. It supports smart references,
+ *      meaning that the non-managed CUDA resource, returning a device-tagged pointer, will work, but will be much less
+ *      efficient than the disjoint version, which wouldn't need to touch device memory at all, and therefore wouldn't need
+ *      to transfer it back and forth between the host and the device whenever an allocation or a deallocation happens.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory blocks
+ */
+template<typename Upstream>
+class unsynchronized_pool_resource final
+    : public memory_resource<typename Upstream::pointer>,
+        private validator<Upstream>
+{
+public:
+    /*! Get the default options for a pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        pool_options ret;
+
+        ret.min_blocks_per_chunk = 16;
+        ret.min_bytes_per_chunk = 1024;
+        ret.max_blocks_per_chunk = static_cast<std::size_t>(1) << 20;
+        ret.max_bytes_per_chunk = static_cast<std::size_t>(1) << 30;
+
+        ret.smallest_block_size = THRUST_MR_DEFAULT_ALIGNMENT;
+        ret.largest_block_size = static_cast<std::size_t>(1) << 20;
+
+        ret.alignment = THRUST_MR_DEFAULT_ALIGNMENT;
+
+        ret.cache_oversized = true;
+
+        ret.cached_size_cutoff_factor = 16;
+        ret.cached_alignment_cutoff_factor = 16;
+
+        return ret;
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param options pool options to use
+     */
+    unsynchronized_pool_resource(Upstream * upstream, pool_options options = get_default_options())
+        : m_upstream(upstream),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(upstream),
+        m_allocated(),
+        m_oversized(),
+        m_cached_oversized()
+    {
+        assert(m_options.validate());
+
+        pool p = { block_descriptor_ptr(), 0 };
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    // TODO: C++11: use delegating constructors
+
+    /*! Constructor. The upstream resource is obtained by calling \p get_global_resource<Upstream>.
+     *
+     *  \param options pool options to use
+     */
+    unsynchronized_pool_resource(pool_options options = get_default_options())
+        : m_upstream(get_global_resource<Upstream>()),
+        m_options(options),
+        m_smallest_block_log2(detail::log2_ri(m_options.smallest_block_size)),
+        m_pools(get_global_resource<Upstream>()),
+        m_allocated(),
+        m_oversized(),
+        m_cached_oversized()
+    {
+        assert(m_options.validate());
+
+        pool p = { block_descriptor_ptr(), 0 };
+        m_pools.resize(detail::log2_ri(m_options.largest_block_size) - m_smallest_block_log2 + 1, p);
+    }
+
+    /*! Destructor. Releases all held memory to upstream.
+     */
+    ~unsynchronized_pool_resource()
+    {
+        release();
+    }
+
+private:
+    typedef typename Upstream::pointer void_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<char>::other char_ptr;
+
+    struct block_descriptor;
+    struct chunk_descriptor;
+    struct oversized_block_descriptor;
+
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<block_descriptor>::other block_descriptor_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<chunk_descriptor>::other chunk_descriptor_ptr;
+    typedef typename thrust::detail::pointer_traits<void_ptr>::template rebind<oversized_block_descriptor>::other oversized_block_descriptor_ptr;
+
+    struct block_descriptor
+    {
+        block_descriptor_ptr next;
+    };
+
+    struct chunk_descriptor
+    {
+        std::size_t size;
+        chunk_descriptor_ptr next;
+    };
+
+    // this was originally a forward list, but I made it a doubly linked list
+    // because that way deallocation when not caching is faster and doesn't require
+    // traversal of a linked list (it's still a forward list for the cached list,
+    // because allocation from that list already traverses)
+    //
+    // TODO: investigate whether it's better to have this be a doubly-linked list
+    // with fast do_deallocate when !m_options.cache_oversized, or to have this be
+    // a forward list and require traversal in do_deallocate
+    //
+    // I assume that it is better this way, but the additional pointer could
+    // potentially hurt? these are supposed to be oversized and/or overaligned,
+    // so they are kinda memory intensive already
+    struct oversized_block_descriptor
+    {
+        std::size_t size;
+        std::size_t alignment;
+        oversized_block_descriptor_ptr prev;
+        oversized_block_descriptor_ptr next;
+        oversized_block_descriptor_ptr next_cached;
+    };
+
+    struct pool
+    {
+        block_descriptor_ptr free_list;
+        std::size_t previous_allocated_count;
+    };
+
+    typedef thrust::host_vector<
+        pool,
+        allocator<pool, Upstream>
+    > pool_vector;
+
+    Upstream * m_upstream;
+
+    pool_options m_options;
+    std::size_t m_smallest_block_log2;
+
+    pool_vector m_pools;
+    chunk_descriptor_ptr m_allocated;
+    oversized_block_descriptor_ptr m_oversized;
+    oversized_block_descriptor_ptr m_cached_oversized;
+
+public:
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        // reset the buckets
+        for (std::size_t i = 0; i < m_pools.size(); ++i)
+        {
+            thrust::raw_reference_cast(m_pools[i]).free_list = block_descriptor_ptr();
+            thrust::raw_reference_cast(m_pools[i]).previous_allocated_count = 0;
+        }
+
+        // deallocate memory allocated for the buckets
+        while (detail::pointer_traits<chunk_descriptor_ptr>::get(m_allocated))
+        {
+            chunk_descriptor_ptr alloc = m_allocated;
+            m_allocated = thrust::raw_reference_cast(*m_allocated).next;
+
+            void_ptr p = static_cast<void_ptr>(
+                static_cast<char_ptr>(
+                    static_cast<void_ptr>(alloc)
+                ) - thrust::raw_reference_cast(*alloc).size
+            );
+            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(chunk_descriptor), m_options.alignment);
+        }
+
+        // deallocate cached oversized/overaligned memory
+        while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(m_oversized))
+        {
+            oversized_block_descriptor_ptr alloc = m_oversized;
+            m_oversized = thrust::raw_reference_cast(*m_oversized).next;
+
+            void_ptr p = static_cast<void_ptr>(
+                static_cast<char_ptr>(
+                    static_cast<void_ptr>(alloc)
+                ) - thrust::raw_reference_cast(*alloc).size
+            );
+            m_upstream->do_deallocate(p, thrust::raw_reference_cast(*alloc).size + sizeof(oversized_block_descriptor), thrust::raw_reference_cast(*alloc).alignment);
+        }
+
+        m_cached_oversized = oversized_block_descriptor_ptr();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        bytes = (std::max)(bytes, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // an oversized and/or overaligned allocation requested; needs to be allocated separately
+        if (bytes > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            if (m_options.cache_oversized)
+            {
+                oversized_block_descriptor_ptr ptr = m_cached_oversized;
+                oversized_block_descriptor_ptr * previous = &m_cached_oversized;
+                while (detail::pointer_traits<oversized_block_descriptor_ptr>::get(ptr))
+                {
+                    oversized_block_descriptor desc = *ptr;
+                    bool is_good = desc.size >= bytes && desc.alignment >= alignment;
+
+                    // if the size is bigger than the requested size by a factor
+                    // bigger than or equal to the specified cutoff for size,
+                    // allocate a new block
+                    if (is_good)
+                    {
+                        std::size_t size_factor = desc.size / bytes;
+                        if (size_factor >= m_options.cached_size_cutoff_factor)
+                        {
+                            is_good = false;
+                        }
+                    }
+
+                    // if the alignment is bigger than the requested one by a factor
+                    // bigger than or equal to the specified cutoff for alignment,
+                    // allocate a new block
+                    if (is_good)
+                    {
+                        std::size_t alignment_factor = desc.alignment / alignment;
+                        if (alignment_factor >= m_options.cached_alignment_cutoff_factor)
+                        {
+                            is_good = false;
+                        }
+                    }
+
+                    if (is_good)
+                    {
+                        if (previous != &m_cached_oversized)
+                        {
+                            oversized_block_descriptor previous_desc = **previous;
+                            previous_desc.next_cached = desc.next_cached;
+                            **previous = previous_desc;
+                        }
+                        else
+                        {
+                            m_cached_oversized = desc.next_cached;
+                        }
+
+                        desc.next_cached = oversized_block_descriptor_ptr();
+                        *ptr = desc;
+
+                        return static_cast<void_ptr>(
+                            static_cast<char_ptr>(
+                                static_cast<void_ptr>(ptr)
+                            ) - desc.size
+                        );
+                    }
+
+                    previous = &thrust::raw_reference_cast(*ptr).next_cached;
+                    ptr = *previous;
+                }
+            }
+
+            // no fitting cached block found; allocate a new one that's just up to the specs
+            void_ptr allocated = m_upstream->do_allocate(bytes + sizeof(oversized_block_descriptor), alignment);
+            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(allocated) + bytes
+                )
+            );
+
+            oversized_block_descriptor desc;
+            desc.size = bytes;
+            desc.alignment = alignment;
+            desc.prev = oversized_block_descriptor_ptr();
+            desc.next = m_oversized;
+            desc.next_cached = oversized_block_descriptor_ptr();
+            *block = desc;
+            m_oversized = block;
+
+            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
+            {
+                oversized_block_descriptor next = *desc.next;
+                next.prev = block;
+                *desc.next = next;
+            }
+
+            return allocated;
+        }
+
+        // the request is NOT for oversized and/or overaligned memory
+        // allocate a block from an appropriate bucket
+        std::size_t bytes_log2 = thrust::detail::log2_ri(bytes);
+        std::size_t bucket_idx = bytes_log2 - m_smallest_block_log2;
+        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);
+
+        bytes = static_cast<std::size_t>(1) << bytes_log2;
+
+        // if the free list of the bucket has no elements, allocate a new chunk
+        // and split it into blocks pushed to the free list
+        if (!detail::pointer_traits<block_descriptor_ptr>::get(bucket.free_list))
+        {
+            std::size_t n = bucket.previous_allocated_count;
+            if (n == 0)
+            {
+                n = m_options.min_blocks_per_chunk;
+                if (n < (m_options.min_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.min_bytes_per_chunk >> bytes_log2;
+                }
+            }
+            else
+            {
+                n = n * 3 / 2;
+                if (n > (m_options.max_bytes_per_chunk >> bytes_log2))
+                {
+                    n = m_options.max_bytes_per_chunk >> bytes_log2;
+                }
+                if (n > m_options.max_blocks_per_chunk)
+                {
+                    n = m_options.max_blocks_per_chunk;
+                }
+            }
+
+            std::size_t descriptor_size = (std::max)(sizeof(block_descriptor), m_options.alignment);
+            std::size_t block_size = bytes + descriptor_size;
+            block_size += m_options.alignment - block_size % m_options.alignment;
+            std::size_t chunk_size = block_size * n;
+
+            void_ptr allocated = m_upstream->do_allocate(chunk_size + sizeof(chunk_descriptor), m_options.alignment);
+            chunk_descriptor_ptr chunk = static_cast<chunk_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(allocated) + chunk_size
+                )
+            );
+
+            chunk_descriptor chunk_desc;
+            chunk_desc.size = chunk_size;
+            chunk_desc.next = m_allocated;
+            *chunk = chunk_desc;
+            m_allocated = chunk;
+
+            for (std::size_t i = 0; i < n; ++i)
+            {
+                block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
+                    static_cast<void_ptr>(
+                        static_cast<char_ptr>(allocated) + block_size * i + bytes
+                    )
+                );
+
+                block_descriptor block_desc;
+                block_desc.next = bucket.free_list;
+                *block = block_desc;
+                bucket.free_list = block;
+            }
+        }
+
+        // allocate a block from the front of the bucket's free list
+        block_descriptor_ptr block = bucket.free_list;
+        bucket.free_list = thrust::raw_reference_cast(*block).next;
+        return static_cast<void_ptr>(
+            static_cast<char_ptr>(
+                static_cast<void_ptr>(block)
+            ) - bytes
+        );
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        n = (std::max)(n, m_options.smallest_block_size);
+        assert(detail::is_power_of_2(alignment));
+
+        // verify that the pointer is at least as aligned as claimed
+        assert(reinterpret_cast<detail::intmax_t>(detail::pointer_traits<void_ptr>::get(p)) % alignment == 0);
+
+        // the deallocated block is oversized and/or overaligned
+        if (n > m_options.largest_block_size || alignment > m_options.alignment)
+        {
+            oversized_block_descriptor_ptr block = static_cast<oversized_block_descriptor_ptr>(
+                static_cast<void_ptr>(
+                    static_cast<char_ptr>(p) + n
+                )
+            );
+
+            oversized_block_descriptor desc = *block;
+
+            if (m_options.cache_oversized)
+            {
+                desc.next_cached = m_cached_oversized;
+                *block = desc;
+                m_cached_oversized = block;
+
+                return;
+            }
+
+            if (!detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.prev))
+            {
+                assert(m_oversized == block);
+                m_oversized = desc.next;
+            }
+            else
+            {
+                oversized_block_descriptor prev = *desc.prev;
+                assert(prev.next == block);
+                prev.next = desc.next;
+                *desc.prev = prev;
+            }
+
+            if (detail::pointer_traits<oversized_block_descriptor_ptr>::get(desc.next))
+            {
+                oversized_block_descriptor next = *desc.next;
+                assert(next.prev == block);
+                next.prev = desc.prev;
+                *desc.next = next;
+            }
+
+            m_upstream->do_deallocate(p, desc.size + sizeof(oversized_block_descriptor), desc.alignment);
+
+            return;
+        }
+
+        // push the block to the front of the appropriate bucket's free list
+        std::size_t n_log2 = thrust::detail::log2_ri(n);
+        std::size_t bucket_idx = n_log2 - m_smallest_block_log2;
+        pool & bucket = thrust::raw_reference_cast(m_pools[bucket_idx]);
+
+        n = static_cast<std::size_t>(1) << n_log2;
+
+        block_descriptor_ptr block = static_cast<block_descriptor_ptr>(
+            static_cast<void_ptr>(
+                static_cast<char_ptr>(p) + n
+            )
+        );
+
+        block_descriptor desc;
+        desc.next = bucket.free_list;
+        *block = desc;
+        bucket.free_list = block;
+    }
+};
+
+/*! \} // memory_resources
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/pool_options.h b/thrust/mr/pool_options.h
new file mode 100644
index 000000000..13a8fe674
--- /dev/null
+++ b/thrust/mr/pool_options.h
@@ -0,0 +1,128 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief A type used by the pooling resource adaptors to fine-tune their
+ *  behavior.
+ */
+
+#pragma once
+
+#include <cstddef>
+
+#include <thrust/detail/integer_math.h>
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/config/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A type used for configuring pooling resource adaptors, to fine-tune their behavior and parameters.
+ */
+struct pool_options
+{
+    /*! The minimal number of blocks, i.e. pieces of memory handed off to the user from a pool of a given size, in a single
+     *      chunk allocated from upstream.
+     */
+    std::size_t min_blocks_per_chunk;
+    /*! The minimal number of bytes in a single chunk allocated from upstream.
+     */
+    std::size_t min_bytes_per_chunk;
+    /*! The maximal number of blocks, i.e. pieces of memory handed off to the user from a pool of a given size, in a single
+     *      chunk allocated from upstream.
+     */
+    std::size_t max_blocks_per_chunk;
+    /*! The maximal number of bytes in a single chunk allocated from upstream.
+     */
+    std::size_t max_bytes_per_chunk;
+
+    /*! The size of blocks in the smallest pool covered by the pool resource. All allocation requests below this size will
+     *      be rounded up to this size.
+     */
+    std::size_t smallest_block_size;
+    /*! The size of blocks in the largest pool covered by the pool resource. All allocation requests above this size will
+     *      be considered oversized, allocated directly from upstream (and not from a pool), and cached only of \p cache_oversized
+     *      is true.
+     */
+    std::size_t largest_block_size;
+
+    /*! The alignment of all blocks in internal pools of the pool resource. All allocation requests above this alignment
+     *      will be considered oversized, allocated directly from upstream (and not from a pool), and cached only of
+     *      \p cache_oversized is true.
+     */
+    std::size_t alignment;
+
+    /*! Decides whether oversized and overaligned blocks are cached for later use, or immediately return it to the upstream
+     *      resource.
+     */
+    bool cache_oversized;
+
+    /*! The size factor at which a cached allocation is considered too ridiculously oversized to use to fulfill an allocation
+     *      request. For instance: the user requests an allocation of size 1024 bytes. A block of size 32 * 1024 bytes is
+     *      cached. If \p cached_size_cutoff_factor is 32 or less, this block will be considered too big for that allocation
+     *      request.
+     */
+    std::size_t cached_size_cutoff_factor;
+    /*! The alignment factor at which a cached allocation is considered too ridiculously overaligned to use to fulfill an
+     *      allocation request. For instance: the user requests an allocation aligned to 32 bytes. A block aligned to 1024 bytes
+     *      is cached. If \p cached_size_cutoff_factor is 32 or less, this block will be considered too overaligned for that
+     *      allocation request.
+     */
+    std::size_t cached_alignment_cutoff_factor;
+
+    /*! Checks if the options are self-consistent.
+     *
+     *  /returns true if the options are self-consitent, false otherwise.
+     */
+    bool validate() const
+    {
+        if (!detail::is_power_of_2(smallest_block_size)) return false;
+        if (!detail::is_power_of_2(largest_block_size)) return false;
+        if (!detail::is_power_of_2(alignment)) return false;
+
+        if (max_bytes_per_chunk == 0 || max_blocks_per_chunk == 0) return false;
+        if (smallest_block_size == 0 || largest_block_size == 0) return false;
+
+        if (min_blocks_per_chunk > max_blocks_per_chunk) return false;
+        if (min_bytes_per_chunk > max_bytes_per_chunk) return false;
+
+        if (smallest_block_size > largest_block_size) return false;
+
+        if (min_blocks_per_chunk * smallest_block_size > max_bytes_per_chunk) return false;
+        if (min_blocks_per_chunk * largest_block_size > max_bytes_per_chunk) return false;
+
+        if (max_blocks_per_chunk * largest_block_size < min_bytes_per_chunk) return false;
+        if (max_blocks_per_chunk * smallest_block_size < min_bytes_per_chunk) return false;
+
+        if (alignment > smallest_block_size) return false;
+
+        return true;
+    }
+};
+
+/*! \} // memory_resources
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/mr/sync_pool.h b/thrust/mr/sync_pool.h
new file mode 100644
index 000000000..46c0e8441
--- /dev/null
+++ b/thrust/mr/sync_pool.h
@@ -0,0 +1,114 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file 
+ *  \brief A mutex-synchronized version of \p unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <mutex>
+
+#include <thrust/mr/pool.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! A mutex-synchronized version of \p unsynchronized_pool_resource. Uses \p std::mutex, and therefore requires C++11.
+ *
+ *  \tparam Upstream the type of memory resources that will be used for allocating memory
+ */
+template<typename Upstream>
+struct synchronized_pool_resource : public memory_resource<typename Upstream::pointer>
+{
+    typedef unsynchronized_pool_resource<Upstream> unsync_pool;
+    typedef std::lock_guard<std::mutex> lock_t;
+
+    typedef typename Upstream::pointer void_ptr;
+
+public:
+    /*! Get the default options for a pool. These are meant to be a sensible set of values for many use cases,
+     *      and as such, may be tuned in the future. This function is exposed so that creating a set of options that are
+     *      just a slight departure from the defaults is easy.
+     */
+    static pool_options get_default_options()
+    {
+        return unsync_pool::get_default_options();
+    }
+
+    /*! Constructor.
+     *
+     *  \param upstream the upstream memory resource for allocations
+     *  \param options pool options to use
+     */
+    synchronized_pool_resource(Upstream * upstream, pool_options options = get_default_options())
+        : upstream_pool(upstream, options)
+    {
+    }
+
+    /*! Constructor. The upstream resource is obtained by calling \p get_global_resource<Upstream>.
+     *
+     *  \param options pool options to use
+     */
+    synchronized_pool_resource(pool_options options = get_default_options())
+        : upstream_pool(get_global_resource<Upstream>(), options)
+    {
+    }
+
+    /*! Releases all held memory to upstream.
+     */
+    void release()
+    {
+        lock_t lock(mtx);
+        upstream_pool.release();
+    }
+
+    THRUST_NODISCARD virtual void_ptr do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        lock_t lock(mtx);
+        return upstream_pool.do_allocate(bytes, alignment);
+    }
+
+    virtual void do_deallocate(void_ptr p, std::size_t n, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+    {
+        lock_t lock(mtx);
+        upstream_pool.do_deallocate(p, n, alignment);
+    }
+
+private:
+    std::mutex mtx;
+    unsync_pool upstream_pool;
+};
+
+/*! \} // memory_resources
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/mr/tls_pool.h b/thrust/mr/tls_pool.h
new file mode 100644
index 000000000..8ee8127a3
--- /dev/null
+++ b/thrust/mr/tls_pool.h
@@ -0,0 +1,64 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file tls_pool.h
+ *  \brief A function wrapping a thread local instance of a \p unsynchronized_pool_resource.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/mr/pool.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+/*! \addtogroup memory_management Memory Management
+ *  \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_resources
+ *  \{
+ */
+
+/*! Potentially constructs, if not yet created, and then returns the address of a thread-local \p unsynchronized_pool_resource,
+ *
+ *  \tparam Upstream the template argument to the pool template
+ *  \param upstream the argument to the constructor, if invoked
+ */
+template<typename Upstream, typename Bookkeeper>
+__host__
+thrust::mr::unsynchronized_pool_resource<Upstream> & tls_pool(Upstream * upstream = NULL)
+{
+    static thread_local auto adaptor = [&]{
+        assert(upstream);
+        return thrust::mr::unsynchronized_pool_resource<Upstream>(upstream);
+    }();
+
+    return adaptor;
+}
+
+/*! \}
+ */
+
+} // end mr
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/mr/universal_memory_resource.h b/thrust/mr/universal_memory_resource.h
new file mode 100644
index 000000000..b7f1ebd6f
--- /dev/null
+++ b/thrust/mr/universal_memory_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/mr/device_memory_resource.h>
+
diff --git a/thrust/mr/validator.h b/thrust/mr/validator.h
new file mode 100644
index 000000000..10e964821
--- /dev/null
+++ b/thrust/mr/validator.h
@@ -0,0 +1,51 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/config/memory_resource.h>
+#include <thrust/mr/memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace mr
+{
+
+template<typename MR>
+struct validator
+{
+#if THRUST_CPP_DIALECT >= 2011
+  static_assert(
+    std::is_base_of<memory_resource<typename MR::pointer>, MR>::value,
+    "a type used as a memory resource must derive from memory_resource"
+  );
+#endif
+};
+
+template<typename T, typename U>
+struct validator2 : private validator<T>, private validator<U>
+{
+};
+
+template<typename T>
+struct validator2<T, T> : private validator<T>
+{
+};
+
+} // end mr
+THRUST_NAMESPACE_END
+
diff --git a/thrust/optional.h b/thrust/optional.h
new file mode 100644
index 000000000..a1ca4f465
--- /dev/null
+++ b/thrust/optional.h
@@ -0,0 +1,2876 @@
+///
+// optional - An implementation of std::optional with extensions
+// Written in 2017 by Sy Brand (@TartanLlama)
+//
+// To the extent possible under law, the author(s) have dedicated all
+// copyright and related and neighboring rights to this software to the
+// public domain worldwide. This software is distributed without any warranty.
+//
+// You should have received a copy of the CC0 Public Domain Dedication
+// along with this software. If not, see
+// <http://creativecommons.org/publicdomain/zero/1.0/>.
+///
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/type_traits.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/addressof.h>
+#include <thrust/swap.h>
+
+#define THRUST_OPTIONAL_VERSION_MAJOR 0
+#define THRUST_OPTIONAL_VERSION_MINOR 2
+
+#include <exception>
+#include <functional>
+#include <new>
+#include <type_traits>
+#include <utility>
+
+#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER == 1900)
+#define THRUST_OPTIONAL_MSVC2015
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC49
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 4 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC54
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 5 && __GNUC_MINOR__ <= 5 &&              \
+     !defined(__clang__))
+#define THRUST_OPTIONAL_GCC55
+#endif
+
+#if (defined(__GNUC__) && __GNUC__ == 4 && __GNUC_MINOR__ <= 9 &&              \
+     !defined(__clang__))
+// GCC < 5 doesn't support overloading on const&& for member functions
+#define THRUST_OPTIONAL_NO_CONSTRR
+
+// GCC < 5 doesn't support some standard C++11 type traits
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
+  std::has_trivial_copy_constructor<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) std::has_trivial_copy_assign<T>::value
+
+// GCC < 5 doesn't provide a way to emulate std::is_trivially_move_*,
+// so don't enable any optimizations that rely on them:
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) false
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) false
+
+// This one will be different for GCC 5.7 if it's ever supported
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
+
+// GCC 5 < v < 8 has a bug in is_trivially_copy_constructible which breaks std::vector
+// for non-copyable types
+#elif (defined(__GNUC__) && __GNUC__ < 8 &&                                                \
+     !defined(__clang__))
+#ifndef THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+#define THRUST_GCC_LESS_8_TRIVIALLY_COPY_CONSTRUCTIBLE_MUTEX
+THRUST_NAMESPACE_BEGIN
+  namespace detail {
+      template<class T>
+      struct is_trivially_copy_constructible : std::is_trivially_copy_constructible<T>{};
+#ifdef _GLIBCXX_VECTOR
+      template<class T, class A>
+      struct is_trivially_copy_constructible<std::vector<T,A>>
+          : std::is_trivially_copy_constructible<T>{};
+#endif
+  }
+THRUST_NAMESPACE_END
+#endif
+
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)                                     \
+    thrust::detail::is_trivially_copy_constructible<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T)                                        \
+  std::is_trivially_copy_assignable<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)                                     \
+  std::is_trivially_move_constructible<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)                                        \
+  std::is_trivially_move_assignable<T>::value
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) std::is_trivially_destructible<T>::value
+#else
+
+// To support clang + old libstdc++ without type traits, check for equivalent
+// clang built-ins and use them if present. See note above
+// is_trivially_copyable_impl in
+// thrust/type_traits/is_trivially_relocatable.h for more details.
+
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) \
+  std::is_trivially_copy_constructible<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+  __is_trivially_assignable(T&, T const&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) \
+  std::is_trivially_copy_assignable<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_constructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  __is_trivially_constructible(T, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) \
+  std::is_trivially_move_constructible<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_assignable)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  __is_trivially_assignable(T&, T&&)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T) \
+  std::is_trivially_move_assignable<T>::value
+#endif
+
+#if defined(__GLIBCXX__) && __has_feature(is_trivially_destructible)
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  __is_trivially_destructible(T)
+#else
+#define THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) \
+  std::is_trivially_destructible<T>::value
+#endif
+
+#endif
+
+#if THRUST_CPP_DIALECT > 2011
+#define THRUST_OPTIONAL_CPP14
+#endif
+
+// constexpr implies const in C++11, not C++14
+#if (THRUST_CPP_DIALECT == 2011 || defined(THRUST_OPTIONAL_MSVC2015) ||                \
+     defined(THRUST_OPTIONAL_GCC49))
+/// \exclude
+#define THRUST_OPTIONAL_CPP11_CONSTEXPR
+#else
+/// \exclude
+#define THRUST_OPTIONAL_CPP11_CONSTEXPR constexpr
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+#ifndef THRUST_MONOSTATE_INPLACE_MUTEX
+#define THRUST_MONOSTATE_INPLACE_MUTEX
+/// \brief Used to represent an optional with no data; essentially a bool
+class monostate {};
+
+/// \brief A tag type to tell optional to construct its value in-place
+struct in_place_t {
+  explicit in_place_t() = default;
+};
+/// \brief A tag to tell optional to construct its value in-place
+static constexpr in_place_t in_place{};
+#endif
+
+template <class T> class optional;
+
+/// \exclude
+namespace detail {
+#ifndef THRUST_TRAITS_MUTEX
+#define THRUST_TRAITS_MUTEX
+// C++14-style aliases for brevity
+template <class T> using remove_const_t = typename std::remove_const<T>::type;
+template <class T>
+using remove_reference_t = typename std::remove_reference<T>::type;
+template <class T> using decay_t = typename std::decay<T>::type;
+template <bool E, class T = void>
+using enable_if_t = typename std::enable_if<E, T>::type;
+template <bool B, class T, class F>
+using conditional_t = typename std::conditional<B, T, F>::type;
+
+// std::conjunction from C++17
+template <class...> struct conjunction : std::true_type {};
+template <class B> struct conjunction<B> : B {};
+template <class B, class... Bs>
+struct conjunction<B, Bs...>
+    : std::conditional<bool(B::value), conjunction<Bs...>, B>::type {};
+
+#if defined(_LIBCPP_VERSION) && THRUST_CPP_DIALECT == 2011
+#define THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+#endif
+
+// In C++11 mode, there's an issue in libc++'s std::mem_fn
+// which results in a hard-error when using it in a noexcept expression
+// in some cases. This is a check to workaround the common failing case.
+#ifdef THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+template <class T> struct is_pointer_to_non_const_member_func : std::false_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...)&&> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&> : std::true_type{};
+template <class T, class Ret, class... Args>
+struct is_pointer_to_non_const_member_func<Ret (T::*) (Args...) volatile&&> : std::true_type{};
+
+template <class T> struct is_const_or_const_ref : std::false_type{};
+template <class T> struct is_const_or_const_ref<T const&> : std::true_type{};
+template <class T> struct is_const_or_const_ref<T const> : std::true_type{};
+#endif
+
+// std::invoke from C++17
+// https://stackoverflow.com/questions/38288042/c11-14-invoke-workaround
+__thrust_exec_check_disable__
+template <typename Fn, typename... Args,
+#ifdef THRUST_OPTIONAL_LIBCXX_MEM_FN_WORKAROUND
+          typename = enable_if_t<!(is_pointer_to_non_const_member_func<Fn>::value
+                                 && is_const_or_const_ref<Args...>::value)>,
+#endif
+          typename = enable_if_t<std::is_member_pointer<decay_t<Fn>>::value>,
+          int = 0>
+__host__ __device__
+constexpr auto invoke(Fn &&f, Args &&... args)
+  noexcept(noexcept(std::mem_fn(f)(std::forward<Args>(args)...)))
+  THRUST_TRAILING_RETURN(decltype(std::mem_fn(f)(std::forward<Args>(args)...)))
+{
+  return std::mem_fn(f)(std::forward<Args>(args)...);
+}
+
+__thrust_exec_check_disable__
+template <typename Fn, typename... Args,
+          typename = enable_if_t<!std::is_member_pointer<decay_t<Fn>>::value>>
+__host__ __device__
+constexpr auto invoke(Fn &&f, Args &&... args)
+  noexcept(noexcept(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+  THRUST_TRAILING_RETURN(decltype(std::forward<Fn>(f)(std::forward<Args>(args)...)))
+{
+  return std::forward<Fn>(f)(std::forward<Args>(args)...);
+}
+#endif
+
+// std::void_t from C++17
+template <class...> struct voider { using type = void; };
+template <class... Ts> using void_t = typename voider<Ts...>::type;
+
+// Trait for checking if a type is a thrust::optional
+template <class T> struct is_optional_impl : std::false_type {};
+template <class T> struct is_optional_impl<optional<T>> : std::true_type {};
+template <class T> using is_optional = is_optional_impl<decay_t<T>>;
+
+// Change void to thrust::monostate
+template <class U>
+using fixup_void = conditional_t<std::is_void<U>::value, monostate, U>;
+
+template <class F, class U, class = invoke_result_t<F, U>>
+using get_map_return = optional<fixup_void<invoke_result_t<F, U>>>;
+
+// Check if invoking F for some Us returns void
+template <class F, class = void, class... U> struct returns_void_impl;
+template <class F, class... U>
+struct returns_void_impl<F, void_t<invoke_result_t<F, U...>>, U...>
+    : std::is_void<invoke_result_t<F, U...>> {};
+template <class F, class... U>
+using returns_void = returns_void_impl<F, void, U...>;
+
+template <class T, class... U>
+using enable_if_ret_void = enable_if_t<returns_void<T &&, U...>::value>;
+
+template <class T, class... U>
+using disable_if_ret_void = enable_if_t<!returns_void<T &&, U...>::value>;
+
+template <class T, class U>
+using enable_forward_value =
+    detail::enable_if_t<std::is_constructible<T, U &&>::value &&
+                        !std::is_same<detail::decay_t<U>, in_place_t>::value &&
+                        !std::is_same<optional<T>, detail::decay_t<U>>::value>;
+
+template <class T, class U, class Other>
+using enable_from_other = detail::enable_if_t<
+    std::is_constructible<T, Other>::value &&
+    !std::is_constructible<T, optional<U> &>::value &&
+    !std::is_constructible<T, optional<U> &&>::value &&
+    !std::is_constructible<T, const optional<U> &>::value &&
+    !std::is_constructible<T, const optional<U> &&>::value &&
+    !std::is_convertible<optional<U> &, T>::value &&
+    !std::is_convertible<optional<U> &&, T>::value &&
+    !std::is_convertible<const optional<U> &, T>::value &&
+    !std::is_convertible<const optional<U> &&, T>::value>;
+
+template <class T, class U>
+using enable_assign_forward = detail::enable_if_t<
+    !std::is_same<optional<T>, detail::decay_t<U>>::value &&
+    !detail::conjunction<std::is_scalar<T>,
+                         std::is_same<T, detail::decay_t<U>>>::value &&
+    std::is_constructible<T, U>::value && std::is_assignable<T &, U>::value>;
+
+template <class T, class U, class Other>
+using enable_assign_from_other = detail::enable_if_t<
+    std::is_constructible<T, Other>::value &&
+    std::is_assignable<T &, Other>::value &&
+    !std::is_constructible<T, optional<U> &>::value &&
+    !std::is_constructible<T, optional<U> &&>::value &&
+    !std::is_constructible<T, const optional<U> &>::value &&
+    !std::is_constructible<T, const optional<U> &&>::value &&
+    !std::is_convertible<optional<U> &, T>::value &&
+    !std::is_convertible<optional<U> &&, T>::value &&
+    !std::is_convertible<const optional<U> &, T>::value &&
+    !std::is_convertible<const optional<U> &&, T>::value &&
+    !std::is_assignable<T &, optional<U> &>::value &&
+    !std::is_assignable<T &, optional<U> &&>::value &&
+    !std::is_assignable<T &, const optional<U> &>::value &&
+    !std::is_assignable<T &, const optional<U> &&>::value>;
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// TODO make a version which works with MSVC
+template <class T, class U = T> struct is_swappable : std::true_type {};
+
+template <class T, class U = T> struct is_nothrow_swappable : std::true_type {};
+#else
+// https://stackoverflow.com/questions/26744589/what-is-a-proper-way-to-implement-is-swappable-to-test-for-the-swappable-concept
+namespace swap_adl_tests {
+// if swap ADL finds this then it would call std::swap otherwise (same
+// signature)
+struct tag {};
+
+template <class T> tag swap(T &, T &);
+template <class T, std::size_t N> tag swap(T (&a)[N], T (&b)[N]);
+
+// helper functions to test if an unqualified swap is possible, and if it
+// becomes std::swap
+template <class, class> std::false_type can_swap(...) noexcept(false);
+template <class T, class U,
+          class = decltype(swap(std::declval<T &>(), std::declval<U &>()))>
+std::true_type can_swap(int) noexcept(noexcept(swap(std::declval<T &>(),
+                                                    std::declval<U &>())));
+
+template <class, class> std::false_type uses_std(...);
+template <class T, class U>
+std::is_same<decltype(swap(std::declval<T &>(), std::declval<U &>())), tag>
+uses_std(int);
+
+template <class T>
+struct is_std_swap_noexcept
+    : std::integral_constant<bool,
+                             std::is_nothrow_move_constructible<T>::value &&
+                                 std::is_nothrow_move_assignable<T>::value> {};
+
+template <class T, std::size_t N>
+struct is_std_swap_noexcept<T[N]> : is_std_swap_noexcept<T> {};
+
+template <class T, class U>
+struct is_adl_swap_noexcept
+    : std::integral_constant<bool, noexcept(can_swap<T, U>(0))> {};
+} // namespace swap_adl_tests
+
+template <class T, class U = T>
+struct is_swappable
+    : std::integral_constant<
+          bool,
+          decltype(detail::swap_adl_tests::can_swap<T, U>(0))::value &&
+              (!decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value ||
+               (std::is_move_assignable<T>::value &&
+                std::is_move_constructible<T>::value))> {};
+
+template <class T, std::size_t N>
+struct is_swappable<T[N], T[N]>
+    : std::integral_constant<
+          bool,
+          decltype(detail::swap_adl_tests::can_swap<T[N], T[N]>(0))::value &&
+              (!decltype(
+                   detail::swap_adl_tests::uses_std<T[N], T[N]>(0))::value ||
+               is_swappable<T, T>::value)> {};
+
+template <class T, class U = T>
+struct is_nothrow_swappable
+    : std::integral_constant<
+          bool,
+          is_swappable<T, U>::value &&
+              ((decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value
+                    &&detail::swap_adl_tests::is_std_swap_noexcept<T>::value) ||
+               (!decltype(detail::swap_adl_tests::uses_std<T, U>(0))::value &&
+                    detail::swap_adl_tests::is_adl_swap_noexcept<T,
+                                                                 U>::value))> {
+};
+#endif
+
+// The storage base manages the actual storage, and correctly propagates
+// trivial destruction from T. This case is for when T is not trivially
+// destructible.
+template <class T, bool = ::std::is_trivially_destructible<T>::value>
+struct optional_storage_base {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base() noexcept
+      : m_dummy(), m_has_value(false) {}
+
+  __thrust_exec_check_disable__
+  template <class... U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
+      : m_value(std::forward<U>(u)...), m_has_value(true) {}
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ~optional_storage_base() {
+    if (m_has_value) {
+      m_value.~T();
+      m_has_value = false;
+    }
+  }
+
+  struct dummy {};
+  union {
+    dummy m_dummy;
+    T m_value;
+  };
+
+  bool m_has_value;
+};
+
+// This case is for when T is trivially destructible.
+template <class T> struct optional_storage_base<T, true> {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base() noexcept
+      : m_dummy(), m_has_value(false) {}
+
+  __thrust_exec_check_disable__
+  template <class... U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional_storage_base(in_place_t, U &&... u)
+      : m_value(std::forward<U>(u)...), m_has_value(true) {}
+
+  // No destructor, so this class is trivially destructible
+
+  struct dummy {};
+  union {
+    dummy m_dummy;
+    T m_value;
+  };
+
+  bool m_has_value = false;
+};
+
+// This base class provides some handy member functions which can be used in
+// further derived classes
+template <class T> struct optional_operations_base : optional_storage_base<T> {
+  using optional_storage_base<T>::optional_storage_base;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void hard_reset() noexcept {
+    get().~T();
+    this->m_has_value = false;
+  }
+
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  void construct(Args &&... args) noexcept {
+    new (thrust::addressof(this->m_value)) T(std::forward<Args>(args)...);
+    this->m_has_value = true;
+  }
+
+  __thrust_exec_check_disable__
+  template <class Opt>
+  __host__ __device__
+  void assign(Opt &&rhs) {
+    if (this->has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = std::forward<Opt>(rhs).get();
+      } else {
+        this->m_value.~T();
+        this->m_has_value = false;
+      }
+    }
+
+    if (rhs.has_value()) {
+      construct(std::forward<Opt>(rhs).get());
+    }
+  }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  bool has_value() const { return this->m_has_value; }
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &get() & { return this->m_value; }
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &get() const & { return this->m_value; }
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&get() && { return std::move(this->m_value); }
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &&get() const && { return std::move(this->m_value); }
+#endif
+};
+
+// This class manages conditionally having a trivial copy constructor
+// This specialization is for when T is trivially copy constructible
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T)>
+struct optional_copy_base : optional_operations_base<T> {
+  using optional_operations_base<T>::optional_operations_base;
+};
+
+// This specialization is for when T is not trivially copy constructible
+template <class T>
+struct optional_copy_base<T, false> : optional_operations_base<T> {
+  using optional_operations_base<T>::optional_operations_base;
+
+  __thrust_exec_check_disable__
+  optional_copy_base() = default;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_copy_base(const optional_copy_base &rhs) {
+    if (rhs.has_value()) {
+      this->construct(rhs.get());
+    } else {
+      this->m_has_value = false;
+    }
+  }
+
+  __thrust_exec_check_disable__
+  optional_copy_base(optional_copy_base &&rhs) = default;
+  __thrust_exec_check_disable__
+  optional_copy_base &operator=(const optional_copy_base &rhs) = default;
+  __thrust_exec_check_disable__
+  optional_copy_base &operator=(optional_copy_base &&rhs) = default;
+};
+
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T)>
+struct optional_move_base : optional_copy_base<T> {
+  using optional_copy_base<T>::optional_copy_base;
+};
+template <class T> struct optional_move_base<T, false> : optional_copy_base<T> {
+  using optional_copy_base<T>::optional_copy_base;
+
+  __thrust_exec_check_disable__
+  optional_move_base() = default;
+  __thrust_exec_check_disable__
+  optional_move_base(const optional_move_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_move_base(optional_move_base &&rhs) noexcept(
+      std::is_nothrow_move_constructible<T>::value) {
+    if (rhs.has_value()) {
+      this->construct(std::move(rhs.get()));
+    } else {
+      this->m_has_value = false;
+    }
+  }
+  __thrust_exec_check_disable__
+  optional_move_base &operator=(const optional_move_base &rhs) = default;
+  __thrust_exec_check_disable__
+  optional_move_base &operator=(optional_move_base &&rhs) = default;
+};
+
+// This class manages conditionally having a trivial copy assignment operator
+template <class T, bool = THRUST_OPTIONAL_IS_TRIVIALLY_COPY_ASSIGNABLE(T) &&
+                          THRUST_OPTIONAL_IS_TRIVIALLY_COPY_CONSTRUCTIBLE(T) &&
+                          THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T)>
+struct optional_copy_assign_base : optional_move_base<T> {
+  using optional_move_base<T>::optional_move_base;
+};
+
+template <class T>
+struct optional_copy_assign_base<T, false> : optional_move_base<T> {
+  using optional_move_base<T>::optional_move_base;
+
+  __thrust_exec_check_disable__
+  optional_copy_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_copy_assign_base(const optional_copy_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_copy_assign_base(optional_copy_assign_base &&rhs) = default;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_copy_assign_base &operator=(const optional_copy_assign_base &rhs) {
+    this->assign(rhs);
+    return *this;
+  }
+  __thrust_exec_check_disable__
+  optional_copy_assign_base &
+  operator=(optional_copy_assign_base &&rhs) = default;
+};
+
+template <class T,
+          bool = THRUST_OPTIONAL_IS_TRIVIALLY_DESTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_CONSTRUCTIBLE(T) &&
+                 THRUST_OPTIONAL_IS_TRIVIALLY_MOVE_ASSIGNABLE(T)>
+struct optional_move_assign_base : optional_copy_assign_base<T> {
+  using optional_copy_assign_base<T>::optional_copy_assign_base;
+};
+
+template <class T>
+struct optional_move_assign_base<T, false> : optional_copy_assign_base<T> {
+  using optional_copy_assign_base<T>::optional_copy_assign_base;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_move_assign_base(const optional_move_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base(optional_move_assign_base &&rhs) = default;
+
+  __thrust_exec_check_disable__
+  optional_move_assign_base &
+  operator=(const optional_move_assign_base &rhs) = default;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional_move_assign_base &
+  operator=(optional_move_assign_base &&rhs) noexcept(
+      std::is_nothrow_move_constructible<T>::value
+          &&std::is_nothrow_move_assignable<T>::value) {
+    this->assign(std::move(rhs));
+    return *this;
+  }
+};
+
+// optional_delete_ctor_base will conditionally delete copy and move
+// constructors depending on whether T is copy/move constructible
+template <class T, bool EnableCopy = std::is_copy_constructible<T>::value,
+          bool EnableMove = std::is_move_constructible<T>::value>
+struct optional_delete_ctor_base {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, true, false> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, false, true> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_ctor_base<T, false, false> {
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(const optional_delete_ctor_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base(optional_delete_ctor_base &&) noexcept = delete;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(const optional_delete_ctor_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_ctor_base &
+  operator=(optional_delete_ctor_base &&) noexcept = default;
+};
+
+// optional_delete_assign_base will conditionally delete copy and move
+// constructors depending on whether T is copy/move constructible + assignable
+template <class T,
+          bool EnableCopy = (std::is_copy_constructible<T>::value &&
+                             std::is_copy_assignable<T>::value),
+          bool EnableMove = (std::is_move_constructible<T>::value &&
+                             std::is_move_assignable<T>::value)>
+struct optional_delete_assign_base {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_assign_base<T, true, false> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = delete;
+};
+
+template <class T> struct optional_delete_assign_base<T, false, true> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = default;
+};
+
+template <class T> struct optional_delete_assign_base<T, false, false> {
+  __thrust_exec_check_disable__
+  optional_delete_assign_base() = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(const optional_delete_assign_base &) = default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base(optional_delete_assign_base &&) noexcept =
+      default;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(const optional_delete_assign_base &) = delete;
+  __thrust_exec_check_disable__
+  optional_delete_assign_base &
+  operator=(optional_delete_assign_base &&) noexcept = delete;
+};
+
+} // namespace detail
+
+/// \brief A tag type to represent an empty optional
+struct nullopt_t {
+  struct do_not_use {};
+  __host__ __device__
+  constexpr explicit nullopt_t(do_not_use, do_not_use) noexcept {}
+};
+/// \brief Represents an empty optional
+/// \synopsis static constexpr nullopt_t nullopt;
+///
+/// *Examples*:
+/// ```
+/// thrust::optional<int> a = thrust::nullopt;
+/// void foo (thrust::optional<int>);
+/// foo(thrust::nullopt); //pass an empty optional
+/// ```
+static constexpr nullopt_t nullopt{nullopt_t::do_not_use{},
+                                   nullopt_t::do_not_use{}};
+
+class bad_optional_access : public std::exception {
+public:
+  bad_optional_access() = default;
+  __host__
+  const char *what() const noexcept { return "Optional has no value"; }
+};
+
+/// An optional object is an object that contains the storage for another
+/// object and manages the lifetime of this contained object, if any. The
+/// contained object may be initialized after the optional object has been
+/// initialized, and may be destroyed before the optional object has been
+/// destroyed. The initialization state of the contained object is tracked by
+/// the optional object.
+template <class T>
+class optional : private detail::optional_move_assign_base<T>,
+                 private detail::optional_delete_ctor_base<T>,
+                 private detail::optional_delete_assign_base<T> {
+  using base = detail::optional_move_assign_base<T>;
+
+  static_assert(!std::is_same<T, in_place_t>::value,
+                "instantiation of optional with in_place_t is ill-formed");
+  static_assert(!std::is_same<detail::decay_t<T>, nullopt_t>::value,
+                "instantiation of optional with nullopt_t is ill-formed");
+
+public:
+// The different versions for C++14 and 11 are needed because deduced return
+// types are not SFINAE-safe. This provides better support for things like
+// generic lambdas. C.f.
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+#endif
+#else
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`.
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise the return value of
+  /// `std::invoke(std::forward<F>(f), value())` is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &&> and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &&> and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &&>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : result(nullopt);
+  }
+#endif
+#endif
+
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#else
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(optional_map_impl(std::declval<optional &>(),
+                                             std::declval<F &&>()))
+  map(F &&f) & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(optional_map_impl(std::declval<optional &&>(),
+                                             std::declval<F &&>()))
+  map(F &&f) && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(optional_map_impl(std::declval<const optional &>(),
+                              std::declval<F &&>()))
+  map(F &&f) const & {
+    return optional_map_impl(*this, std::forward<F>(f));
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(optional_map_impl(std::declval<const optional &&>(),
+                              std::declval<F &&>()))
+  map(F &&f) const && {
+    return optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#endif
+#endif
+
+  /// \brief Calls `f` if the optional is empty
+  /// \requires `std::invoke_result_t<F>` must be void or convertible to
+  /// `optional<T>`.
+  /// \effects If `*this` has a value, returns `*this`.
+  /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)` and returns
+  /// `std::nullopt`. Otherwise, returns `std::forward<F>(f)()`.
+  ///
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) const & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise returns
+  /// `u`.
+  ///
+  /// \details If there is a value stored, then `f` is called with `**this`
+  /// and the value is returned. Otherwise `u` is returned.
+  ///
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise calls
+  /// `u` and returns the result.
+  ///
+  /// \details If there is a value stored, then `f` is
+  /// called with `**this` and the value is returned. Otherwise
+  /// `std::forward<U>(u)()` is returned.
+  ///
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u) &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+#endif
+
+  /// \return `u` if `*this` has a value, otherwise an empty optional.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr optional<typename std::decay<U>::type> conjunction(U &&u) const {
+    using result = optional<detail::decay_t<U>>;
+    return has_value() ? result{u} : result{nullopt};
+  }
+
+  /// \return `rhs` if `*this` is empty, otherwise the current value.
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+#endif
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+#endif
+
+  /// Takes the value out of the optional, leaving it empty
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+#endif
+
+  using value_type = T;
+
+  /// Constructs an optional that does not contain a value.
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  constexpr optional() noexcept = default;
+
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional(nullopt_t) noexcept {}
+
+  /// Copy constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(const optional &rhs) = default;
+
+  /// Move constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(optional &&rhs) = default;
+
+  /// Constructs the stored value in-place using the given arguments.
+  /// \group in_place
+  /// \synopsis template <class... Args> constexpr explicit optional(in_place_t, Args&&... args);
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  constexpr explicit optional(
+      detail::enable_if_t<std::is_constructible<T, Args...>::value, in_place_t>,
+      Args &&... args)
+      : base(in_place, std::forward<Args>(args)...) {}
+
+  /// \group in_place
+  /// \synopsis template <class U, class... Args>\nconstexpr explicit optional(in_place_t, std::initializer_list<U>&, Args&&... args);
+  __thrust_exec_check_disable__
+  template <class U, class... Args>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR explicit optional(
+      detail::enable_if_t<std::is_constructible<T, std::initializer_list<U> &,
+                                                Args &&...>::value,
+                          in_place_t>,
+      std::initializer_list<U> il, Args &&... args) {
+    this->construct(il, std::forward<Args>(args)...);
+  }
+
+  /// Constructs the stored value with `u`.
+  /// \synopsis template <class U=T> constexpr optional(U &&u);
+  __thrust_exec_check_disable__
+  template <
+      class U = T,
+      detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr,
+      detail::enable_forward_value<T, U> * = nullptr>
+  __host__ __device__
+  constexpr optional(U &&u) : base(in_place, std::forward<U>(u)) {}
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <
+      class U = T,
+      detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr,
+      detail::enable_forward_value<T, U> * = nullptr>
+  __host__ __device__
+  constexpr explicit optional(U &&u) : base(in_place, std::forward<U>(u)) {}
+
+  /// Converting copy constructor.
+  /// \synopsis template <class U> optional(const optional<U> &rhs);
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, const U &> * = nullptr,
+      detail::enable_if_t<std::is_convertible<const U &, T>::value> * = nullptr>
+  __host__ __device__
+  optional(const optional<U> &rhs) {
+    this->construct(*rhs);
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class U, detail::enable_from_other<T, U, const U &> * = nullptr,
+            detail::enable_if_t<!std::is_convertible<const U &, T>::value> * =
+                nullptr>
+  __host__ __device__
+  explicit optional(const optional<U> &rhs) {
+    this->construct(*rhs);
+  }
+
+  /// Converting move constructor.
+  /// \synopsis template <class U> optional(optional<U> &&rhs);
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, U &&> * = nullptr,
+      detail::enable_if_t<std::is_convertible<U &&, T>::value> * = nullptr>
+  __host__ __device__
+  optional(optional<U> &&rhs) {
+    this->construct(std::move(*rhs));
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <
+      class U, detail::enable_from_other<T, U, U &&> * = nullptr,
+      detail::enable_if_t<!std::is_convertible<U &&, T>::value> * = nullptr>
+  __host__ __device__
+  explicit optional(optional<U> &&rhs) {
+    this->construct(std::move(*rhs));
+  }
+
+  /// Destroys the stored value if there is one.
+  __thrust_exec_check_disable__
+  ~optional() = default;
+
+  /// Assignment to empty.
+  ///
+  /// Destroys the current value if there is one.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional &operator=(nullopt_t) noexcept {
+    if (has_value()) {
+      this->m_value.~T();
+      this->m_has_value = false;
+    }
+
+    return *this;
+  }
+
+  /// Copy assignment.
+  ///
+  /// Copies the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(const optional &rhs) = default;
+
+  /// Move assignment.
+  ///
+  /// Moves the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(optional &&rhs) = default;
+
+  /// Assigns the stored value from `u`, destroying the old value if there was
+  /// one.
+  /// \synopsis optional &operator=(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T, detail::enable_assign_forward<T, U> * = nullptr>
+  __host__ __device__
+  optional &operator=(U &&u) {
+    if (has_value()) {
+      this->m_value = std::forward<U>(u);
+    } else {
+      this->construct(std::forward<U>(u));
+    }
+
+    return *this;
+  }
+
+  /// Converting copy assignment operator.
+  ///
+  /// Copies the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  /// \synopsis optional &operator=(const optional<U> & rhs);
+  __thrust_exec_check_disable__
+  template <class U,
+            detail::enable_assign_from_other<T, U, const U &> * = nullptr>
+  __host__ __device__
+  optional &operator=(const optional<U> &rhs) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = *rhs;
+      } else {
+        this->hard_reset();
+      }
+    }
+
+    if (rhs.has_value()) {
+      this->construct(*rhs);
+    }
+
+    return *this;
+  }
+
+  // TODO check exception guarantee
+  /// Converting move assignment operator.
+  ///
+  /// Moves the value from `rhs` if there is one. Otherwise resets the stored
+  /// value in `*this`.
+  /// \synopsis optional &operator=(optional<U> && rhs);
+  __thrust_exec_check_disable__
+  template <class U, detail::enable_assign_from_other<T, U, U> * = nullptr>
+  __host__ __device__
+  optional &operator=(optional<U> &&rhs) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        this->m_value = std::move(*rhs);
+      } else {
+        this->hard_reset();
+      }
+    }
+
+    if (rhs.has_value()) {
+      this->construct(std::move(*rhs));
+    }
+
+    return *this;
+  }
+
+  /// Constructs the value in-place, destroying the current one if there is
+  /// one.
+  /// \group emplace
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  T &emplace(Args &&... args) {
+    static_assert(std::is_constructible<T, Args &&...>::value,
+                  "T must be constructible with Args");
+
+    *this = nullopt;
+    this->construct(std::forward<Args>(args)...);
+    return this->m_value;
+  }
+
+  /// \group emplace
+  /// \synopsis template <class U, class... Args>\nT& emplace(std::initializer_list<U> il, Args &&... args);
+  __thrust_exec_check_disable__
+  template <class U, class... Args>
+  __host__ __device__
+  detail::enable_if_t<
+      std::is_constructible<T, std::initializer_list<U> &, Args &&...>::value,
+      T &>
+  emplace(std::initializer_list<U> il, Args &&... args) {
+    *this = nullopt;
+    this->construct(il, std::forward<Args>(args)...);
+    return this->m_value;
+  }
+
+  /// Swaps this optional with the other.
+  ///
+  /// If neither optionals have a value, nothing happens.
+  /// If both have a value, the values are swapped.
+  /// If one has a value, it is moved to the other and the movee is left
+  /// valueless.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void
+  swap(optional &rhs) noexcept(std::is_nothrow_move_constructible<T>::value
+                                   &&detail::is_nothrow_swappable<T>::value) {
+    if (has_value()) {
+      if (rhs.has_value()) {
+        using thrust::swap;
+        swap(**this, *rhs);
+      } else {
+        new (addressof(rhs.m_value)) T(std::move(this->m_value));
+        this->m_value.T::~T();
+      }
+    } else if (rhs.has_value()) {
+      new (addressof(this->m_value)) T(std::move(rhs.m_value));
+      rhs.m_value.T::~T();
+    }
+  }
+
+  /// \return a pointer to the stored value
+  /// \requires a value is stored
+  /// \group pointer
+  /// \synopsis constexpr const T *operator->() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T *operator->() const {
+    return addressof(this->m_value);
+  }
+
+  /// \group pointer
+  /// \synopsis constexpr T *operator->();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T *operator->() {
+    return addressof(this->m_value);
+  }
+
+  /// \return the stored value
+  /// \requires a value is stored
+  /// \group deref
+  /// \synopsis constexpr T &operator*();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &operator*() & { return this->m_value; }
+
+  /// \group deref
+  /// \synopsis constexpr const T &operator*() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator*() const & { return this->m_value; }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&operator*() && {
+    return std::move(this->m_value);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &&operator*() const && { return std::move(this->m_value); }
+#endif
+
+  /// \return whether or not the optional has a value
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool has_value() const noexcept { return this->m_has_value; }
+
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr explicit operator bool() const noexcept {
+    return this->m_has_value;
+  }
+
+  /// \return the contained value if there is one, otherwise throws
+  /// [bad_optional_access]
+  /// \group value
+  /// \synopsis constexpr T &value();
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &value() & {
+    if (has_value())
+      return this->m_value;
+    throw bad_optional_access();
+  }
+  /// \group value
+  /// \synopsis constexpr const T &value() const;
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &value() const & {
+    if (has_value())
+      return this->m_value;
+    throw bad_optional_access();
+  }
+  /// \exclude
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &&value() && {
+    if (has_value())
+      return std::move(this->m_value);
+    throw bad_optional_access();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &&value() const && {
+    if (has_value())
+      return std::move(this->m_value);
+    throw bad_optional_access();
+  }
+#endif
+
+  /// \return the stored value if there is one, otherwise returns `u`
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr T value_or(U &&u) const & {
+    static_assert(std::is_copy_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be copy constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T value_or(U &&u) && {
+    static_assert(std::is_move_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be move constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// Destroys the stored value if one exists, making the optional empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void reset() noexcept {
+    if (has_value()) {
+      this->m_value.~T();
+      this->m_has_value = false;
+    }
+  }
+};
+
+/// \group relop
+/// \brief Compares two optional objects
+/// \details If both optionals contain a value, they are compared with `T`s
+/// relational operators. Otherwise `lhs` and `rhs` are equal only if they are
+/// both empty, and `lhs` is less than `rhs` only if `rhs` is empty and `lhs`
+/// is not.
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return lhs.has_value() == rhs.has_value() &&
+         (!lhs.has_value() || *lhs == *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return lhs.has_value() != rhs.has_value() ||
+         (lhs.has_value() && *lhs != *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const optional<T> &lhs,
+                                const optional<U> &rhs) {
+  return rhs.has_value() && (!lhs.has_value() || *lhs < *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const optional<T> &lhs,
+                                const optional<U> &rhs) {
+  return lhs.has_value() && (!rhs.has_value() || *lhs > *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return !lhs.has_value() || (rhs.has_value() && *lhs <= *rhs);
+}
+/// \group relop
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const optional<T> &lhs,
+                                 const optional<U> &rhs) {
+  return !rhs.has_value() || (lhs.has_value() && *lhs >= *rhs);
+}
+
+/// \group relop_nullopt
+/// \brief Compares an optional to a `nullopt`
+/// \details Equivalent to comparing the optional to an empty optional
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs, nullopt_t) noexcept {
+  return !lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator==(nullopt_t, const optional<T> &rhs) noexcept {
+  return !rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs, nullopt_t) noexcept {
+  return lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator!=(nullopt_t, const optional<T> &rhs) noexcept {
+  return rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator<(const optional<T> &, nullopt_t) noexcept {
+  return false;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator<(nullopt_t, const optional<T> &rhs) noexcept {
+  return rhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator<=(const optional<T> &lhs, nullopt_t) noexcept {
+  return !lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator<=(nullopt_t, const optional<T> &) noexcept {
+  return true;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator>(const optional<T> &lhs, nullopt_t) noexcept {
+  return lhs.has_value();
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator>(nullopt_t, const optional<T> &) noexcept {
+  return false;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator>=(const optional<T> &, nullopt_t) noexcept {
+  return true;
+}
+/// \group relop_nullopt
+__thrust_exec_check_disable__
+template <class T>
+__host__ __device__
+inline constexpr bool operator>=(nullopt_t, const optional<T> &rhs) noexcept {
+  return !rhs.has_value();
+}
+
+/// \group relop_t
+/// \brief Compares the optional with a value.
+/// \details If the optional has a value, it is compared with the other value
+/// using `T`s relational operators. Otherwise, the optional is considered
+/// less than the value.
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs == rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator==(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs == *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs != rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator!=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs != *rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs < rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs < *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs <= rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator<=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs <= *rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs > rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs > *rhs : true;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const optional<T> &lhs, const U &rhs) {
+  return lhs.has_value() ? *lhs >= rhs : false;
+}
+/// \group relop_t
+__thrust_exec_check_disable__
+template <class T, class U>
+__host__ __device__
+inline constexpr bool operator>=(const U &lhs, const optional<T> &rhs) {
+  return rhs.has_value() ? lhs >= *rhs : true;
+}
+
+/// \synopsis template <class T>\nvoid swap(optional<T> &lhs, optional<T> &rhs);
+__thrust_exec_check_disable__
+template <class T,
+          detail::enable_if_t<std::is_move_constructible<T>::value> * = nullptr,
+          detail::enable_if_t<detail::is_swappable<T>::value> * = nullptr>
+__host__ __device__
+void swap(optional<T> &lhs,
+          optional<T> &rhs) noexcept(noexcept(lhs.swap(rhs))) {
+  return lhs.swap(rhs);
+}
+
+namespace detail {
+struct i_am_secret {};
+} // namespace detail
+
+__thrust_exec_check_disable__
+template <class T = detail::i_am_secret, class U,
+          class Ret =
+              detail::conditional_t<std::is_same<T, detail::i_am_secret>::value,
+                                    detail::decay_t<U>, T>>
+__host__ __device__
+inline constexpr optional<Ret> make_optional(U &&v) {
+  return optional<Ret>(std::forward<U>(v));
+}
+
+__thrust_exec_check_disable__
+template <class T, class... Args>
+__host__ __device__
+inline constexpr optional<T> make_optional(Args &&... args) {
+  return optional<T>(in_place, std::forward<Args>(args)...);
+}
+__thrust_exec_check_disable__
+template <class T, class U, class... Args>
+__host__ __device__
+inline constexpr optional<T> make_optional(std::initializer_list<U> il,
+                                           Args &&... args) {
+  return optional<T>(in_place, il, std::forward<Args>(args)...);
+}
+
+#if THRUST_CPP_DIALECT >= 2017
+template <class T> optional(T)->optional<T>;
+#endif
+
+// Doxygen chokes on the trailing return types used below.
+#if !defined(THRUST_DOXYGEN)
+/// \exclude
+namespace detail {
+#ifdef THRUST_OPTIONAL_CPP14
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+constexpr auto optional_map_impl(Opt &&opt, F &&f) {
+  return opt.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt))
+             : optional<Ret>(nullopt);
+}
+
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+auto optional_map_impl(Opt &&opt, F &&f) {
+  if (opt.has_value()) {
+    detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt));
+    return make_optional(monostate{});
+  }
+
+  return optional<monostate>(nullopt);
+}
+#else
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<!std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+constexpr optional<Ret> optional_map_impl(Opt &&opt, F &&f) {
+  return opt.has_value()
+             ? detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt))
+             : optional<Ret>(nullopt);
+}
+
+__thrust_exec_check_disable__
+template <class Opt, class F,
+          class Ret = decltype(detail::invoke(std::declval<F>(),
+                                              *std::declval<Opt>())),
+          detail::enable_if_t<std::is_void<Ret>::value> * = nullptr>
+__host__ __device__
+auto optional_map_impl(Opt &&opt, F &&f) -> optional<monostate>
+{
+  if (opt.has_value()) {
+    detail::invoke(std::forward<F>(f), *std::forward<Opt>(opt));
+    return monostate{};
+  }
+
+  return nullopt;
+}
+#endif
+} // namespace detail
+#endif // !defined(THRUST_DOXYGEN)
+
+/// Specialization for when `T` is a reference. `optional<T&>` acts similarly
+/// to a `T*`, but provides more operations and shows intent more clearly.
+///
+/// *Examples*:
+///
+/// ```
+/// int i = 42;
+/// thrust::optional<int&> o = i;
+/// *o == 42; //true
+/// i = 12;
+/// *o = 12; //true
+/// &*o == &i; //true
+/// ```
+///
+/// Assignment has rebind semantics rather than assign-through semantics:
+///
+/// ```
+/// int j = 8;
+/// o = j;
+///
+/// &*o == &j; //true
+/// ```
+template <class T> class optional<T &> {
+public:
+// The different versions for C++14 and 11 are needed because deduced return
+// types are not SFINAE-safe. This provides better support for things like
+// generic lambdas. C.f.
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2017/p0826r0
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+#endif
+#else
+  /// \group and_then
+  /// Carries out some operation which returns an optional on the stored
+  /// object if there is one. \requires `std::invoke(std::forward<F>(f),
+  /// value())` returns a `std::optional<U>` for some `U`. \return Let `U` be
+  /// the result of `std::invoke(std::forward<F>(f), value())`. Returns a
+  /// `std::optional<U>`. The return value is empty if `*this` is empty,
+  /// otherwise the return value of `std::invoke(std::forward<F>(f), value())`
+  /// is returned.
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) & {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR detail::invoke_result_t<F, T &> and_then(F &&f) && {
+    using result = detail::invoke_result_t<F, T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const & {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group and_then
+  /// \synopsis template <class F>\nconstexpr auto and_then(F &&f) const &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr detail::invoke_result_t<F, const T &> and_then(F &&f) const && {
+    using result = detail::invoke_result_t<F, const T &>;
+    static_assert(detail::is_optional<result>::value,
+                  "F must return an optional");
+
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : result(nullopt);
+  }
+#endif
+#endif
+
+#if defined(THRUST_OPTIONAL_CPP14) && !defined(THRUST_OPTIONAL_GCC49) &&               \
+    !defined(THRUST_OPTIONAL_GCC54) && !defined(THRUST_OPTIONAL_GCC55)
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR auto map(F &&f) && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> constexpr auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr auto map(F &&f) const && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#else
+  /// \brief Carries out some operation on the stored object if there is one.
+  /// \return Let `U` be the result of `std::invoke(std::forward<F>(f),
+  /// value())`. Returns a `std::optional<U>`. The return value is empty if
+  /// `*this` is empty, otherwise an `optional<U>` is constructed from the
+  /// return value of `std::invoke(std::forward<F>(f), value())` and is
+  /// returned.
+  ///
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(detail::optional_map_impl(std::declval<optional &>(),
+                                                     std::declval<F &&>()))
+  map(F &&f) & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR decltype(detail::optional_map_impl(std::declval<optional &&>(),
+                                                     std::declval<F &&>()))
+  map(F &&f) && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(detail::optional_map_impl(std::declval<const optional &>(),
+                                      std::declval<F &&>()))
+  map(F &&f) const & {
+    return detail::optional_map_impl(*this, std::forward<F>(f));
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map
+  /// \synopsis template <class F> auto map(F &&f) const&&;
+  __thrust_exec_check_disable__
+  template <class F>
+  __host__ __device__
+  constexpr decltype(detail::optional_map_impl(std::declval<const optional &&>(),
+                                      std::declval<F &&>()))
+  map(F &&f) const && {
+    return detail::optional_map_impl(std::move(*this), std::forward<F>(f));
+  }
+#endif
+#endif
+
+  /// \brief Calls `f` if the optional is empty
+  /// \requires `std::invoke_result_t<F>` must be void or convertible to
+  /// `optional<T>`. \effects If `*this` has a value, returns `*this`.
+  /// Otherwise, if `f` returns `void`, calls `std::forward<F>(f)` and returns
+  /// `std::nullopt`. Otherwise, returns `std::forward<F>(f)()`.
+  ///
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T>
+  THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T>
+  THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) &&;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+
+  /// \group or_else
+  /// \synopsis template <class F> optional<T> or_else (F &&f) const &;
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const & {
+    if (has_value())
+      return *this;
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> THRUST_OPTIONAL_CPP11_CONSTEXPR or_else(F &&f) const & {
+    return has_value() ? *this : std::forward<F>(f)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::enable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    if (has_value())
+      return std::move(*this);
+
+    std::forward<F>(f)();
+    return nullopt;
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class F, detail::disable_if_ret_void<F> * = nullptr>
+  __host__ __device__
+  optional<T> or_else(F &&f) const && {
+    return has_value() ? std::move(*this) : std::forward<F>(f)();
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise returns
+  /// `u`.
+  ///
+  /// \details If there is a value stored, then `f` is called with `**this`
+  /// and the value is returned. Otherwise `u` is returned.
+  ///
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  U map_or(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u);
+  }
+#endif
+
+  /// \brief Maps the stored value with `f` if there is one, otherwise calls
+  /// `u` and returns the result.
+  ///
+  /// \details If there is a value stored, then `f` is
+  /// called with `**this` and the value is returned. Otherwise
+  /// `std::forward<U>(u)()` is returned.
+  ///
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u) &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const & {
+    return has_value() ? detail::invoke(std::forward<F>(f), **this)
+                       : std::forward<U>(u)();
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group map_or_else
+  /// \synopsis template <class F, class U>\nauto map_or_else(F &&f, U &&u)
+  /// const &&;
+  __thrust_exec_check_disable__
+  template <class F, class U>
+  __host__ __device__
+  detail::invoke_result_t<U> map_or_else(F &&f, U &&u) const && {
+    return has_value() ? detail::invoke(std::forward<F>(f), std::move(**this))
+                       : std::forward<U>(u)();
+  }
+#endif
+
+  /// \return `u` if `*this` has a value, otherwise an empty optional.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr optional<typename std::decay<U>::type> conjunction(U &&u) const {
+    using result = optional<detail::decay_t<U>>;
+    return has_value() ? result{u} : result{nullopt};
+  }
+
+  /// \return `rhs` if `*this` is empty, otherwise the current value.
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const & {
+    return has_value() ? *this : rhs;
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(const optional &rhs) && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(const optional &rhs) const && {
+    return has_value() ? std::move(*this) : rhs;
+  }
+#endif
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const & {
+    return has_value() ? *this : std::move(rhs);
+  }
+
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional disjunction(optional &&rhs) && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group disjunction
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional disjunction(optional &&rhs) const && {
+    return has_value() ? std::move(*this) : std::move(rhs);
+  }
+#endif
+
+  /// Takes the value out of the optional, leaving it empty
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const & {
+    optional ret = *this;
+    reset();
+    return ret;
+  }
+
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+
+#ifndef THRUST_OPTIONAL_NO_CONSTRR
+  /// \group take
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional take() const && {
+    optional ret = std::move(*this);
+    reset();
+    return ret;
+  }
+#endif
+
+  using value_type = T &;
+
+  /// Constructs an optional that does not contain a value.
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional() noexcept : m_value(nullptr) {}
+
+  /// \group ctor_empty
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr optional(nullopt_t) noexcept : m_value(nullptr) {}
+
+  /// Copy constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(const optional &rhs) noexcept = default;
+
+  /// Move constructor
+  ///
+  /// If `rhs` contains a value, the stored value is direct-initialized with
+  /// it. Otherwise, the constructed optional is empty.
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR optional(optional &&rhs) = default;
+
+  /// Constructs the stored value with `u`.
+  /// \synopsis template <class U=T> constexpr optional(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T,
+            detail::enable_if_t<!detail::is_optional<detail::decay_t<U>>::value>
+                * = nullptr>
+  __host__ __device__
+  constexpr optional(U &&u) : m_value(addressof(u)) {
+    static_assert(std::is_lvalue_reference<U>::value, "U must be an lvalue");
+  }
+
+  /// \exclude
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr explicit optional(const optional<U> &rhs) : optional(*rhs) {}
+
+  /// No-op
+  __thrust_exec_check_disable__
+  ~optional() = default;
+
+  /// Assignment to empty.
+  ///
+  /// Destroys the current value if there is one.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  optional &operator=(nullopt_t) noexcept {
+    m_value = nullptr;
+    return *this;
+  }
+
+  /// Copy assignment.
+  ///
+  /// Rebinds this optional to the referee of `rhs` if there is one. Otherwise
+  /// resets the stored value in `*this`.
+  __thrust_exec_check_disable__
+  optional &operator=(const optional &rhs) = default;
+
+  /// Rebinds this optional to `u`.
+  ///
+  /// \requires `U` must be an lvalue reference.
+  /// \synopsis optional &operator=(U &&u);
+  __thrust_exec_check_disable__
+  template <class U = T,
+            detail::enable_if_t<!detail::is_optional<detail::decay_t<U>>::value>
+                * = nullptr>
+  __host__ __device__
+  optional &operator=(U &&u) {
+    static_assert(std::is_lvalue_reference<U>::value, "U must be an lvalue");
+    m_value = addressof(u);
+    return *this;
+  }
+
+  /// Converting copy assignment operator.
+  ///
+  /// Rebinds this optional to the referee of `rhs` if there is one. Otherwise
+  /// resets the stored value in `*this`.
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  optional &operator=(const optional<U> &rhs) {
+    m_value = addressof(rhs.value());
+    return *this;
+  }
+
+  /// Constructs the value in-place, destroying the current one if there is
+  /// one.
+  ///
+  /// \group emplace
+  __thrust_exec_check_disable__
+  template <class... Args>
+  __host__ __device__
+  T &emplace(Args &&... args) noexcept {
+    static_assert(std::is_constructible<T, Args &&...>::value,
+                  "T must be constructible with Args");
+
+    *this = nullopt;
+    this->construct(std::forward<Args>(args)...);
+  }
+
+  /// Swaps this optional with the other.
+  ///
+  /// If neither optionals have a value, nothing happens.
+  /// If both have a value, the values are swapped.
+  /// If one has a value, it is moved to the other and the movee is left
+  /// valueless.
+  __thrust_exec_check_disable__
+  __host__ __device__
+  void swap(optional &rhs) noexcept { std::swap(m_value, rhs.m_value); }
+
+  /// \return a pointer to the stored value
+  /// \requires a value is stored
+  /// \group pointer
+  /// \synopsis constexpr const T *operator->() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T *operator->() const { return m_value; }
+
+  /// \group pointer
+  /// \synopsis constexpr T *operator->();
+  __thrust_exec_check_disable__
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T *operator->() { return m_value; }
+
+  /// \return the stored value
+  /// \requires a value is stored
+  /// \group deref
+  /// \synopsis constexpr T &operator*();
+  __thrust_exec_check_disable__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &operator*() { return *m_value; }
+
+  /// \group deref
+  /// \synopsis constexpr const T &operator*() const;
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr const T &operator*() const { return *m_value; }
+
+  /// \return whether or not the optional has a value
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr bool has_value() const noexcept { return m_value != nullptr; }
+
+  /// \group has_value
+  __thrust_exec_check_disable__
+  __host__ __device__
+  constexpr explicit operator bool() const noexcept {
+    return m_value != nullptr;
+  }
+
+  /// \return the contained value if there is one, otherwise throws
+  /// [bad_optional_access]
+  /// \group value
+  /// synopsis constexpr T &value();
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T &value() {
+    if (has_value())
+      return *m_value;
+    throw bad_optional_access();
+  }
+  /// \group value
+  /// \synopsis constexpr const T &value() const;
+  __host__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR const T &value() const {
+    if (has_value())
+      return *m_value;
+    throw bad_optional_access();
+  }
+
+  /// \return the stored value if there is one, otherwise returns `u`
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  constexpr T value_or(U &&u) const & {
+    static_assert(std::is_copy_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be copy constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// \group value_or
+  __thrust_exec_check_disable__
+  template <class U>
+  __host__ __device__
+  THRUST_OPTIONAL_CPP11_CONSTEXPR T value_or(U &&u) && {
+    static_assert(std::is_move_constructible<T>::value &&
+                      std::is_convertible<U &&, T>::value,
+                  "T must be move constructible and convertible from U");
+    return has_value() ? **this : static_cast<T>(std::forward<U>(u));
+  }
+
+  /// Destroys the stored value if one exists, making the optional empty
+  __thrust_exec_check_disable__
+  void reset() noexcept { m_value = nullptr; }
+
+private:
+  T *m_value;
+};
+
+THRUST_NAMESPACE_END
+
+namespace std {
+// TODO SFINAE
+template <class T> struct hash<THRUST_NS_QUALIFIER::optional<T>> {
+  __thrust_exec_check_disable__
+  __host__ __device__
+  ::std::size_t operator()(const THRUST_NS_QUALIFIER::optional<T> &o) const {
+    if (!o.has_value())
+      return 0;
+
+    return std::hash<THRUST_NS_QUALIFIER::detail::remove_const_t<T>>()(*o);
+  }
+};
+} // namespace std
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/pair.h b/thrust/pair.h
index 48da892c7..eb2138aaf 100644
--- a/thrust/pair.h
+++ b/thrust/pair.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <utility>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -119,8 +118,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>x.first == y.first && x.second == y.second</tt>.
  *  
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -133,8 +132,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>x.first < y.first || (!(y.first < x.first) && x.second < y.second)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -147,8 +146,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(x == y)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -161,8 +160,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>y < x</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -175,8 +174,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(y < x)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -189,8 +188,8 @@ template <typename T1, typename T2>
  *  \param y The second \p pair to compare.
  *  \return \c true if and only if <tt>!(x < y)</tt>.
  *
- *  \tparam T1 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
- *  \tparam T2 is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam T1 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
+ *  \tparam T2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  */
 template <typename T1, typename T2>
   inline __host__ __device__
@@ -228,7 +227,7 @@ template <typename T1, typename T2>
  *  \tparam N This parameter selects the member of interest.
  *  \tparam T A \c pair type of interest.
  */
-template<int N, typename T> struct tuple_element;
+template<size_t N, class T> struct tuple_element;
 
 
 /*! This convenience metafunction is included for compatibility with
@@ -277,7 +276,6 @@ template<typename Pair> struct tuple_size;
 /*! \} // utility
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/pair.inl>
-
diff --git a/thrust/partition.h b/thrust/partition.h
index 6b941f036..90768f246 100644
--- a/thrust/partition.h
+++ b/thrust/partition.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reordering
  *  \ingroup algorithms
@@ -48,7 +46,7 @@ namespace thrust
  *
  *  Note that the relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition, does guarantee to preserve the relative order.
+ *  \p stable_partition, does guarantee to preserve the relative order.
  *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
@@ -61,10 +59,10 @@ namespace thrust
  *          the sequence of the elements which do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p partition to reorder a
  *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
@@ -90,7 +88,7 @@ namespace thrust
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -114,7 +112,7 @@ __host__ __device__
  *
  *  Note that the relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition, does guarantee to preserve the relative order.
+ *  \p stable_partition, does guarantee to preserve the relative order.
  *
  *  \param first The beginning of the sequence to reorder.
  *  \param last The end of the sequence to reorder.
@@ -123,10 +121,10 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements which do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p partition to reorder a
  *  sequence so that even numbers precede odd numbers.
@@ -150,7 +148,7 @@ __host__ __device__
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -172,7 +170,7 @@ template<typename ForwardIterator,
  *
  *  Note that the relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition, does guarantee to preserve the relative order.
+ *  \p stable_partition, does guarantee to preserve the relative order.
  *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
@@ -186,11 +184,11 @@ template<typename ForwardIterator,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
  *
@@ -218,7 +216,7 @@ template<typename ForwardIterator,
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -245,7 +243,7 @@ __host__ __device__
  *
  *  Note that the relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition, does guarantee to preserve the relative order.
+ *  \p stable_partition, does guarantee to preserve the relative order.
  *
  *  \param first The beginning of the sequence to reorder.
  *  \param last The end of the sequence to reorder.
@@ -255,11 +253,11 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The ranges <tt>[first,last)</tt> and <tt>[stencil, stencil + (last - first))</tt> shall not overlap.
  *
@@ -286,7 +284,7 @@ __host__ __device__
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partition
  *  \see \p stable_partition
  *  \see \p partition_copy
  */
@@ -299,7 +297,7 @@ template<typename ForwardIterator,
                             Predicate pred);
 
 
-/*! \p partition_copy differs from \ref partition only in that the reordered
+/*! \p partition_copy differs from \p partition only in that the reordered
  *  sequence is written to difference output sequences, rather than in place.
  *
  *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -321,12 +319,12 @@ template<typename ForwardIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input range shall not overlap with either output range.
  *
@@ -360,7 +358,7 @@ template<typename ForwardIterator,
  *
  *  \note The relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition_copy, does guarantee to preserve the relative order.
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
  *
  *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
  *  \see \p stable_partition_copy
@@ -381,7 +379,7 @@ __host__ __device__
                    Predicate pred);
 
 
-/*! \p partition_copy differs from \ref partition only in that the reordered
+/*! \p partition_copy differs from \p partition only in that the reordered
  *  sequence is written to difference output sequences, rather than in place.
  *
  *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -399,12 +397,12 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input range shall not overlap with either output range.
  *
@@ -437,7 +435,7 @@ __host__ __device__
  *
  *  \note The relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition_copy, does guarantee to preserve the relative order.
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
  *
  *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
  *  \see \p stable_partition_copy
@@ -455,7 +453,7 @@ template<typename InputIterator,
                    Predicate pred);
 
 
-/*! \p partition_copy differs from \ref partition only in that the reordered
+/*! \p partition_copy differs from \p partition only in that the reordered
  *  sequence is written to difference output sequences, rather than in place.
  *
  *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -479,13 +477,13 @@ template<typename InputIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -514,7 +512,7 @@ template<typename InputIterator,
  *
  *  \note The relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition_copy, does guarantee to preserve the relative order.
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
  *
  *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
  *  \see \p stable_partition_copy
@@ -537,7 +535,7 @@ __host__ __device__
                    Predicate pred);
 
 
-/*! \p partition_copy differs from \ref partition only in that the reordered
+/*! \p partition_copy differs from \p partition only in that the reordered
  *  sequence is written to difference output sequences, rather than in place.
  *
  *  \p partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -557,13 +555,13 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -590,7 +588,7 @@ __host__ __device__
  *
  *  \note The relative order of elements in the two reordered sequences is not
  *  necessarily the same as it was in the original sequence. A different algorithm,
- *  \ref stable_partition_copy, does guarantee to preserve the relative order.
+ *  \p stable_partition_copy, does guarantee to preserve the relative order.
  *
  *  \see http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2008/n2569.pdf
  *  \see \p stable_partition_copy
@@ -610,7 +608,7 @@ template<typename InputIterator1,
                    Predicate pred);
 
 
-/*! \p stable_partition is much like \ref partition : it reorders the elements in the
+/*! \p stable_partition is much like \p partition : it reorders the elements in the
  *  range <tt>[first, last)</tt> based on the function object \p pred, such that all of
  *  the elements that satisfy \p pred precede all of the elements that fail to satisfy
  *  it. The postcondition is that, for some iterator \p middle in the range
@@ -618,7 +616,7 @@ template<typename InputIterator1,
  *  range <tt>[first,middle)</tt> and \c false for every iterator \c i in the range
  *  <tt>[middle, last)</tt>. The return value of \p stable_partition is \c middle.
  *
- *  \p stable_partition differs from \ref partition in that \p stable_partition is
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
  *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
  *  <tt>[first, last)</tt>, and \c stencil_x and \c stencil_y are the stencil elements
  *  in corresponding positions within <tt>[stencil, stencil + (last - first))</tt>,
@@ -636,10 +634,10 @@ template<typename InputIterator1,
  *          the sequence of the elements which do not satisfy pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p stable_partition to reorder a
  *  sequence so that even numbers precede odd numbers using the \p thrust::host execution policy for parallelization:
@@ -665,7 +663,7 @@ template<typename InputIterator1,
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -679,7 +677,7 @@ __host__ __device__
                                    Predicate pred);
 
 
-/*! \p stable_partition is much like \ref partition : it reorders the elements in the
+/*! \p stable_partition is much like \p partition : it reorders the elements in the
  *  range <tt>[first, last)</tt> based on the function object \p pred, such that all of
  *  the elements that satisfy \p pred precede all of the elements that fail to satisfy
  *  it. The postcondition is that, for some iterator \p middle in the range
@@ -687,7 +685,7 @@ __host__ __device__
  *  range <tt>[first,middle)</tt> and \c false for every iterator \c i in the range
  *  <tt>[middle, last)</tt>. The return value of \p stable_partition is \c middle.
  *
- *  \p stable_partition differs from \ref partition in that \p stable_partition is
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
  *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
  *  <tt>[first, last)</tt>, and \c stencil_x and \c stencil_y are the stencil elements
  *  in corresponding positions within <tt>[stencil, stencil + (last - first))</tt>,
@@ -701,10 +699,10 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements which do not satisfy pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type,
  *          and \p ForwardIterator is mutable.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p stable_partition to reorder a
  *  sequence so that even numbers precede odd numbers.
@@ -728,7 +726,7 @@ __host__ __device__
  *  // A is now {2, 4, 6, 8, 10, 1, 3, 5, 7, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -749,7 +747,7 @@ template<typename ForwardIterator,
  *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
  *  The return value of \p stable_partition is \c middle.
  *
- *  \p stable_partition differs from \ref partition in that \p stable_partition is
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
  *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
  *  <tt>[first, last)</tt>, such that <tt>pred(x) == pred(y)</tt>, and if \c x precedes
  *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
@@ -766,11 +764,11 @@ template<typename ForwardIterator,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
@@ -798,7 +796,7 @@ template<typename ForwardIterator,
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -824,7 +822,7 @@ __host__ __device__
  *  in the range <tt>[stencil + (middle - first), stencil + (last - first))</tt>.
  *  The return value of \p stable_partition is \c middle.
  *
- *  \p stable_partition differs from \ref partition in that \p stable_partition is
+ *  \p stable_partition differs from \p partition in that \p stable_partition is
  *  guaranteed to preserve relative order. That is, if \c x and \c y are elements in
  *  <tt>[first, last)</tt>, such that <tt>pred(x) == pred(y)</tt>, and if \c x precedes
  *  \c y, then it will still be true after \p stable_partition that \c x precedes \c y.
@@ -837,11 +835,11 @@ __host__ __device__
  *  \return An iterator referring to the first element of the second partition, that is,
  *          the sequence of the elements whose stencil elements do not satisfy \p pred.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
@@ -868,7 +866,7 @@ __host__ __device__
  *  // S is unmodified
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_partition.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_partition
  *  \see \p partition
  *  \see \p stable_partition_copy
  */
@@ -881,7 +879,7 @@ template<typename ForwardIterator,
                                    Predicate pred);
 
 
-/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
  *  sequence is written to different output sequences, rather than in place.
  *
  *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -889,7 +887,7 @@ template<typename ForwardIterator,
  *  to the range beginning at \p out_true and all the elements that fail to satisfy it
  *  are copied to the range beginning at \p out_false.
  *
- *  \p stable_partition_copy differs from \ref partition_copy in that
+ *  \p stable_partition_copy differs from \p partition_copy in that
  *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
  *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
  *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
@@ -909,12 +907,12 @@ template<typename ForwardIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -965,7 +963,7 @@ __host__ __device__
                           Predicate pred);
 
 
-/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
  *  sequence is written to different output sequences, rather than in place.
  *
  *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -973,7 +971,7 @@ __host__ __device__
  *  to the range beginning at \p out_true and all the elements that fail to satisfy it
  *  are copied to the range beginning at \p out_false.
  *
- *  \p stable_partition_copy differs from \ref partition_copy in that
+ *  \p stable_partition_copy differs from \p partition_copy in that
  *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
  *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
  *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
@@ -989,12 +987,12 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type and \p InputIterator's \c value_type
  *          is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1041,7 +1039,7 @@ template<typename InputIterator,
                           Predicate pred);
 
 
-/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
  *  sequence is written to different output sequences, rather than in place.
  *
  *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -1050,7 +1048,7 @@ template<typename InputIterator,
  *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
  *  at \p out_false.
  *
- *  \p stable_partition_copy differs from \ref partition_copy in that
+ *  \p stable_partition_copy differs from \p partition_copy in that
  *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
  *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
  *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
@@ -1071,13 +1069,13 @@ template<typename InputIterator,
  *          \p out_false.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1124,7 +1122,7 @@ __host__ __device__
                           Predicate pred);
 
 
-/*! \p stable_partition_copy differs from \ref stable_partition only in that the reordered
+/*! \p stable_partition_copy differs from \p stable_partition only in that the reordered
  *  sequence is written to different output sequences, rather than in place.
  *
  *  \p stable_partition_copy copies the elements <tt>[first, last)</tt> based on the
@@ -1133,7 +1131,7 @@ __host__ __device__
  *  and all the elements whose stencil element fails to satisfy it are copied to the range beginning
  *  at \p out_false.
  *
- *  \p stable_partition_copy differs from \ref partition_copy in that
+ *  \p stable_partition_copy differs from \p partition_copy in that
  *  \p stable_partition_copy is guaranteed to preserve relative order. That is, if
  *  \c x and \c y are elements in <tt>[first, last)</tt>, such that
  *  <tt>pred(x) == pred(y)</tt>, and if \c x precedes \c y, then it will still be true
@@ -1150,13 +1148,13 @@ __host__ __device__
  *          at \p out_true and <tt>p.second</tt> is the end of the output range beginning at
  *          \p out_false.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p OutputIterator1 and \p OutputIterator2's \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The input ranges shall not overlap with either output range.
  *
@@ -1226,9 +1224,9 @@ template<typename InputIterator1,
  *          and <tt>none_of(mid, last, pred)</tt> are both true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
  *
@@ -1279,9 +1277,9 @@ __host__ __device__
  *  \return An iterator \c mid such that <tt>all_of(first, mid, pred)</tt>
  *          and <tt>none_of(mid, last, pred)</tt> are both true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall be partitioned by \p pred.
  *
@@ -1345,9 +1343,9 @@ template<typename ForwardIterator, typename Predicate>
  *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *  
  *  \code
  *  #include <thrust/partition.h>
@@ -1395,9 +1393,9 @@ __host__ __device__
  *  \return \c true if the range <tt>[first, last)</tt> is partitioned with respect
  *          to \p pred, or if <tt>[first, last)</tt> is empty. \c false, otherwise.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *  
  *  \code
  *  #include <thrust/partition.h>
@@ -1432,8 +1430,7 @@ template<typename InputIterator, typename Predicate>
  *  \} // end reductions
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/partition.inl>
 
diff --git a/thrust/per_device_resource.h b/thrust/per_device_resource.h
new file mode 100644
index 000000000..a6d620f85
--- /dev/null
+++ b/thrust/per_device_resource.h
@@ -0,0 +1,102 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/system/detail/generic/per_device_resource.h>
+#include <thrust/system/detail/adl/per_device_resource.h>
+#include <thrust/mr/allocator.h>
+
+#include <thrust/detail/execution_policy.h>
+#include <thrust/mr/allocator.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! Returns a global instance of \p MR for the current device of the provided system.
+ *
+ *  \tparam MR type of a memory resource to get an instance from. Must be \p DefaultConstructible.
+ *  \param system execution policy for which the resource is requested.
+ *  \return a pointer to a global instance of \p MR for the current device.
+ */
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(const thrust::detail::execution_policy_base<DerivedPolicy> & system)
+{
+    using thrust::system::detail::generic::get_per_device_resource;
+
+    return get_per_device_resource<MR>(
+        thrust::detail::derived_cast(
+            thrust::detail::strip_const(system)));
+}
+
+/*! A helper allocator class that uses global per device instances of a given upstream memory resource. Requires the memory
+ *      resource to be default constructible.
+ *
+ *  \tparam T the type that will be allocated by this allocator.
+ *  \tparam MR the upstream memory resource to use for memory allocation. Must derive from
+ *      \p thrust::mr::memory_resource and must be \p final.
+ *  \tparam ExecutionPolicy the execution policy of the system to be used to retrieve the resource for the current device.
+ */
+template<typename T, typename Upstream, typename ExecutionPolicy>
+class per_device_allocator : public thrust::mr::allocator<T, Upstream>
+{
+    typedef thrust::mr::allocator<T, Upstream> base;
+
+public:
+    /*! The \p rebind metafunction provides the type of an \p per_device_allocator instantiated with another type.
+     *
+     *  \tparam U the other type to use for instantiation.
+     */
+    template<typename U>
+    struct rebind
+    {
+        /*! The typedef \p other gives the type of the rebound \p per_device_allocator.
+         */
+        typedef per_device_allocator<U, Upstream, ExecutionPolicy> other;
+    };
+
+    /*! Default constructor. Uses \p get_global_resource to get the global instance of \p Upstream and initializes the
+     *      \p allocator base subobject with that resource.
+     */
+    __host__
+    per_device_allocator() : base(get_per_device_resource<Upstream>(ExecutionPolicy()))
+    {
+    }
+
+    /*! Copy constructor. Copies the memory resource pointer. */
+    __host__ __device__
+    per_device_allocator(const per_device_allocator & other)
+        : base(other) {}
+
+    /*! Conversion constructor from an allocator of a different type. Copies the memory resource pointer. */
+    template<typename U>
+    __host__ __device__
+    per_device_allocator(const per_device_allocator<U, Upstream, ExecutionPolicy> & other)
+        : base(other) {}
+
+    /*! Destructor. */
+    __host__ __device__
+    ~per_device_allocator() {}
+};
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
diff --git a/thrust/random.h b/thrust/random.h
index c0e9e2282..7463620b7 100644
--- a/thrust/random.h
+++ b/thrust/random.h
@@ -35,9 +35,7 @@
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/random/normal_distribution.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup random Random Number Generation
  *  \{
@@ -116,5 +114,4 @@ using random::ranlux48;
 using random::taus88;
 using random::default_random_engine;
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/random/detail/discard_block_engine.inl b/thrust/random/detail/discard_block_engine.inl
index 759581d4c..31128e250 100644
--- a/thrust/random/detail/discard_block_engine.inl
+++ b/thrust/random/detail/discard_block_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,16 +14,20 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/discard_block_engine.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   discard_block_engine<Engine,p,r>
     ::discard_block_engine()
       : m_e(), m_n(0)
@@ -31,6 +35,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   discard_block_engine<Engine,p,r>
     ::discard_block_engine(result_type s)
       : m_e(s), m_n(0)
@@ -38,6 +43,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   discard_block_engine<Engine,p,r>
     ::discard_block_engine(const base_type &urng)
       : m_e(urng), m_n(0)
@@ -45,6 +51,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   void discard_block_engine<Engine,p,r>
     ::seed(void)
 {
@@ -54,6 +61,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   void discard_block_engine<Engine,p,r>
     ::seed(result_type s)
 {
@@ -63,6 +71,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   typename discard_block_engine<Engine,p,r>::result_type
     discard_block_engine<Engine,p,r>
       ::operator()(void)
@@ -82,6 +91,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   void discard_block_engine<Engine,p,r>
     ::discard(unsigned long long z)
 {
@@ -94,6 +104,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   const typename discard_block_engine<Engine,p,r>::base_type &
     discard_block_engine<Engine,p,r>
       ::base(void) const
@@ -152,6 +163,7 @@ template<typename Engine, size_t p, size_t r>
 
 
 template<typename Engine, size_t p, size_t r>
+  __host__ __device__
   bool discard_block_engine<Engine,p,r>
     ::equal(const discard_block_engine<Engine,p,r> &rhs) const
 {
@@ -180,6 +192,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 
 template<typename Engine, size_t p, size_t r>
+__host__ __device__
 bool operator==(const discard_block_engine<Engine,p,r> &lhs,
                 const discard_block_engine<Engine,p,r> &rhs)
 {
@@ -188,6 +201,7 @@ bool operator==(const discard_block_engine<Engine,p,r> &lhs,
 
 
 template<typename Engine, size_t p, size_t r>
+__host__ __device__
 bool operator!=(const discard_block_engine<Engine,p,r> &lhs,
                 const discard_block_engine<Engine,p,r> &rhs)
 {
@@ -197,5 +211,5 @@ bool operator!=(const discard_block_engine<Engine,p,r> &lhs,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_congruential_engine.inl b/thrust/random/detail/linear_congruential_engine.inl
index 054ee1106..fa9fd7d0d 100644
--- a/thrust/random/detail/linear_congruential_engine.inl
+++ b/thrust/random/detail/linear_congruential_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,18 +14,22 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_congruential_engine.h>
 #include <thrust/random/detail/mod.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
   linear_congruential_engine<UIntType,a,c,m>
     ::linear_congruential_engine(result_type s)
 {
@@ -34,6 +38,7 @@ template<typename UIntType, UIntType a, UIntType c, UIntType m>
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
   void linear_congruential_engine<UIntType,a,c,m>
     ::seed(result_type s)
 {
@@ -46,6 +51,7 @@ template<typename UIntType, UIntType a, UIntType c, UIntType m>
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
   typename linear_congruential_engine<UIntType,a,c,m>::result_type
     linear_congruential_engine<UIntType,a,c,m>
       ::operator()(void)
@@ -56,6 +62,7 @@ template<typename UIntType, UIntType a, UIntType c, UIntType m>
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+  __host__ __device__
   void linear_congruential_engine<UIntType,a,c,m>
     ::discard(unsigned long long z)
 {
@@ -113,6 +120,7 @@ template<typename UIntType, UIntType a, UIntType c, UIntType m>
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+__host__ __device__
 bool linear_congruential_engine<UIntType,a,c,m>
   ::equal(const linear_congruential_engine<UIntType,a,c,m> &rhs) const
 {
@@ -130,6 +138,7 @@ bool operator==(const linear_congruential_engine<UIntType_,a_,c_,m_> &lhs,
 
 
 template<typename UIntType, UIntType a, UIntType c, UIntType m>
+__host__ __device__
 bool operator!=(const linear_congruential_engine<UIntType,a,c,m> &lhs,
                 const linear_congruential_engine<UIntType,a,c,m> &rhs)
 {
@@ -159,5 +168,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_congruential_engine_discard.h b/thrust/random/detail/linear_congruential_engine_discard.h
index 381595144..c8103d9dc 100644
--- a/thrust/random/detail/linear_congruential_engine_discard.h
+++ b/thrust/random/detail/linear_congruential_engine_discard.h
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/cstdint.h>
 #include <thrust/random/detail/mod.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -103,5 +104,5 @@ struct linear_congruential_engine_discard
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_feedback_shift_engine.inl b/thrust/random/detail/linear_feedback_shift_engine.inl
index 963871736..ac3ca8673 100644
--- a/thrust/random/detail/linear_feedback_shift_engine.inl
+++ b/thrust/random/detail/linear_feedback_shift_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,15 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_feedback_shift_engine.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
   linear_feedback_shift_engine<UIntType,w,k,q,s>
     ::linear_feedback_shift_engine(result_type value)
 {
@@ -30,6 +34,7 @@ template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
 } // end linear_feedback_shift_engine::linear_feedback_shift_engine()
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
   void linear_feedback_shift_engine<UIntType,w,k,q,s>
     ::seed(result_type value)
 {
@@ -37,6 +42,7 @@ template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
 } // end linear_feedback_shift_engine::seed()
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
   typename linear_feedback_shift_engine<UIntType,w,k,q,s>::result_type
     linear_feedback_shift_engine<UIntType,w,k,q,s>
       ::operator()(void)
@@ -49,6 +55,7 @@ template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
 
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
   void linear_feedback_shift_engine<UIntType,w,k,q,s>
     ::discard(unsigned long long z)
 {
@@ -109,6 +116,7 @@ template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
 
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+  __host__ __device__
   bool linear_feedback_shift_engine<UIntType,w,k,q,s>
     ::equal(const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs) const
 {
@@ -117,6 +125,7 @@ template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
 
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+__host__ __device__
 bool operator==(const linear_feedback_shift_engine<UIntType,w,k,q,s> &lhs,
                 const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs)
 {
@@ -125,6 +134,7 @@ bool operator==(const linear_feedback_shift_engine<UIntType,w,k,q,s> &lhs,
 
 
 template<typename UIntType, size_t w, size_t k, size_t q, size_t s>
+__host__ __device__
 bool operator!=(const linear_feedback_shift_engine<UIntType,w,k,q,s> &lhs,
                 const linear_feedback_shift_engine<UIntType,w,k,q,s> &rhs)
 {
@@ -154,5 +164,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/linear_feedback_shift_engine_wordmask.h b/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
index 6669350ea..73c8ae83e 100644
--- a/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
+++ b/thrust/random/detail/linear_feedback_shift_engine_wordmask.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -43,5 +44,5 @@ template<typename T, int w>
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/mod.h b/thrust/random/detail/mod.h
index ed6afcf03..f0637582d 100644
--- a/thrust/random/detail/mod.h
+++ b/thrust/random/detail/mod.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -34,7 +35,7 @@ template<typename T, T a, T c, T m, bool = (m == 0)>
   __host__ __device__
   T operator()(T x) const
   {
-    if(a == 1)
+    THRUST_IF_CONSTEXPR(a == 1)
     {
       x %= m;
     }
@@ -52,7 +53,7 @@ template<typename T, T a, T c, T m, bool = (m == 0)>
       }
     }
 
-    if(c != 0)
+    THRUST_IF_CONSTEXPR(c != 0)
     {
       const T d = m - x;
       if(d > c)
@@ -93,5 +94,5 @@ __host__ __device__
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/normal_distribution.inl b/thrust/random/detail/normal_distribution.inl
index 24e68355f..4b69bab21 100644
--- a/thrust/random/detail/normal_distribution.inl
+++ b/thrust/random/detail/normal_distribution.inl
@@ -1,6 +1,5 @@
 /*
- *
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,6 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/normal_distribution.h>
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/detail/cstdint.h>
@@ -27,14 +30,14 @@
 #include <limits>
 #endif
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename RealType>
+  __host__ __device__
   normal_distribution<RealType>
     ::normal_distribution(RealType a, RealType b)
       :super_t(),m_param(a,b)
@@ -43,6 +46,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   normal_distribution<RealType>
     ::normal_distribution(const param_type &parm)
       :super_t(),m_param(parm)
@@ -51,6 +55,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   void normal_distribution<RealType>
     ::reset(void)
 {
@@ -60,6 +65,7 @@ template<typename RealType>
 
 template<typename RealType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename normal_distribution<RealType>::result_type
       normal_distribution<RealType>
         ::operator()(UniformRandomNumberGenerator &urng)
@@ -70,6 +76,7 @@ template<typename RealType>
 
 template<typename RealType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename normal_distribution<RealType>::result_type
       normal_distribution<RealType>
         ::operator()(UniformRandomNumberGenerator &urng,
@@ -80,6 +87,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   typename normal_distribution<RealType>::param_type
     normal_distribution<RealType>
       ::param(void) const
@@ -89,6 +97,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   void normal_distribution<RealType>
     ::param(const param_type &parm)
 {
@@ -97,15 +106,17 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   typename normal_distribution<RealType>::result_type
     normal_distribution<RealType>
       ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
 {
-  return -this->max();
+  return -this->max THRUST_PREVENT_MACRO_SUBSTITUTION ();
 } // end normal_distribution::min()
 
 
 template<typename RealType>
+  __host__ __device__
   typename normal_distribution<RealType>::result_type
     normal_distribution<RealType>
       ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
@@ -126,6 +137,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   typename normal_distribution<RealType>::result_type
     normal_distribution<RealType>
       ::mean(void) const
@@ -135,6 +147,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   typename normal_distribution<RealType>::result_type
     normal_distribution<RealType>
       ::stddev(void) const
@@ -144,6 +157,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   bool normal_distribution<RealType>
     ::equal(const normal_distribution &rhs) const
 {
@@ -200,6 +214,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+__host__ __device__
 bool operator==(const normal_distribution<RealType> &lhs,
                 const normal_distribution<RealType> &rhs)
 {
@@ -208,6 +223,7 @@ bool operator==(const normal_distribution<RealType> &lhs,
 
 
 template<typename RealType>
+__host__ __device__
 bool operator!=(const normal_distribution<RealType> &lhs,
                 const normal_distribution<RealType> &rhs)
 {
@@ -237,5 +253,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/normal_distribution_base.h b/thrust/random/detail/normal_distribution_base.h
index 6c11af62b..a42e80014 100644
--- a/thrust/random/detail/normal_distribution_base.h
+++ b/thrust/random/detail/normal_distribution_base.h
@@ -29,14 +29,13 @@
 #include <limits>
 #include <cmath>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace random
 {
 namespace detail
 {
 
-// this version samples the normal distribution directly 
+// this version samples the normal distribution directly
 // and uses the non-standard math function erfcinv
 template<typename RealType>
   class normal_distribution_nvcc
@@ -46,15 +45,15 @@ template<typename RealType>
     __host__ __device__
     RealType sample(UniformRandomNumberGenerator &urng, const RealType mean, const RealType stddev)
     {
-      typedef typename UniformRandomNumberGenerator::result_type uint_type;
-      const uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
+      using uint_type = typename UniformRandomNumberGenerator::result_type;
+      constexpr uint_type urng_range = UniformRandomNumberGenerator::max - UniformRandomNumberGenerator::min;
 
       // Constants for conversion
-      const RealType S1 = static_cast<RealType>(1) / urng_range;
-      const RealType S2 = S1 / 2;
+      constexpr RealType S1 = static_cast<RealType>(1. / static_cast<double>(urng_range));
+      constexpr RealType S2 = S1 / 2;
 
       RealType S3 = static_cast<RealType>(-1.4142135623730950488016887242097); // -sqrt(2)
-      
+
       // Get the integer value
       uint_type u = urng() - UniformRandomNumberGenerator::min;
 
@@ -77,7 +76,7 @@ template<typename RealType>
     void reset() {}
 };
 
-// this version samples the normal distribution using 
+// this version samples the normal distribution using
 // Marsaglia's "polar method"
 template<typename RealType>
   class normal_distribution_portable
@@ -136,7 +135,7 @@ template<typename RealType>
 template<typename RealType>
   struct normal_distribution_base
 {
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC && !defined(_NVHPC_CUDA)
   typedef normal_distribution_nvcc<RealType> type;
 #else
   typedef normal_distribution_portable<RealType> type;
@@ -145,5 +144,5 @@ template<typename RealType>
 
 } // end detail
 } // end random
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/random_core_access.h b/thrust/random/detail/random_core_access.h
index f03060e0a..a3e34e02b 100644
--- a/thrust/random/detail/random_core_access.h
+++ b/thrust/random/detail/random_core_access.h
@@ -16,8 +16,9 @@
 
 #pragma once
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -53,5 +54,5 @@ static bool equal(const EngineOrDistribution &lhs, const EngineOrDistribution &r
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/subtract_with_carry_engine.inl b/thrust/random/detail/subtract_with_carry_engine.inl
index 0aa1b44ed..21c22fe77 100644
--- a/thrust/random/detail/subtract_with_carry_engine.inl
+++ b/thrust/random/detail/subtract_with_carry_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,19 +14,23 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/linear_congruential_engine.h>
 #include <thrust/random/subtract_with_carry_engine.h>
 #include <thrust/random/detail/mod.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   subtract_with_carry_engine<UIntType,w,s,r>
     ::subtract_with_carry_engine(result_type value)
 {
@@ -35,6 +39,7 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   void subtract_with_carry_engine<UIntType,w,s,r>
     ::seed(result_type value)
 {
@@ -53,6 +58,7 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   typename subtract_with_carry_engine<UIntType,w,s,r>::result_type
     subtract_with_carry_engine<UIntType,w,s,r>
       ::operator()(void)
@@ -84,6 +90,7 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   void subtract_with_carry_engine<UIntType,w,s,r>
     ::discard(unsigned long long z)
 {
@@ -101,19 +108,19 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 {
   typedef std::basic_ostream<CharT,Traits> ostream_type;
   typedef typename ostream_type::ios_base     ios_base;
-                  
+
   const typename ios_base::fmtflags flags = os.flags();
   const CharT fill  = os.fill();
   const CharT space = os.widen(' ');
   os.flags(ios_base::dec | ios_base::fixed | ios_base::left);
   os.fill(space);
 
-  const UIntType long_lag = r;
-                                                          
+  const UIntType long_lag_ = r;
+
   for(size_t i = 0; i < r; ++i)
-    os << m_x[(i + m_k) % long_lag] << space;
+    os << m_x[(i + m_k) % long_lag_] << space;
   os << m_carry;
-                                                                          
+
   os.flags(flags);
   os.fill(fill);
   return os;
@@ -143,15 +150,16 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   bool subtract_with_carry_engine<UIntType,w,s,r>
     ::equal(const subtract_with_carry_engine<UIntType,w,s,r> &rhs) const
 {
-  const UIntType long_lag = r;
+  const UIntType long_lag_ = r;
 
   bool result = true;
   for(size_t i = 0; i < r; ++i)
   {
-    result &= (m_x[(i + m_k) % long_lag] == rhs.m_x[(i + rhs.m_k) % long_lag]);
+    result &= (m_x[(i + m_k) % long_lag_] == rhs.m_x[(i + rhs.m_k) % long_lag_]);
   }
 
   // XXX not sure if this last check is necessary
@@ -182,6 +190,7 @@ template<typename UIntType, size_t w, size_t s, size_t r,
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   bool operator==(const subtract_with_carry_engine<UIntType,w,s,r> &lhs,
                   const subtract_with_carry_engine<UIntType,w,s,r> &rhs)
 {
@@ -190,6 +199,7 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 
 template<typename UIntType, size_t w, size_t s, size_t r>
+  __host__ __device__
   bool operator!=(const subtract_with_carry_engine<UIntType,w,s,r> &lhs,
                   const subtract_with_carry_engine<UIntType,w,s,r> &rhs)
 {
@@ -199,5 +209,5 @@ template<typename UIntType, size_t w, size_t s, size_t r>
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/uniform_int_distribution.inl b/thrust/random/detail/uniform_int_distribution.inl
index 47d342eef..064bfcc73 100644
--- a/thrust/random/detail/uniform_int_distribution.inl
+++ b/thrust/random/detail/uniform_int_distribution.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,18 +14,22 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/uniform_int_distribution.h>
 #include <thrust/random/uniform_real_distribution.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename IntType>
+  __host__ __device__
   uniform_int_distribution<IntType>
     ::uniform_int_distribution(IntType a, IntType b)
       :m_param(a,b)
@@ -34,6 +38,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   uniform_int_distribution<IntType>
     ::uniform_int_distribution(const param_type &parm)
       :m_param(parm)
@@ -42,6 +47,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   void uniform_int_distribution<IntType>
     ::reset(void)
 {
@@ -50,6 +56,7 @@ template<typename IntType>
 
 template<typename IntType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename uniform_int_distribution<IntType>::result_type
       uniform_int_distribution<IntType>
         ::operator()(UniformRandomNumberGenerator &urng)
@@ -60,6 +67,7 @@ template<typename IntType>
 
 template<typename IntType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename uniform_int_distribution<IntType>::result_type
       uniform_int_distribution<IntType>
         ::operator()(UniformRandomNumberGenerator &urng, const param_type &parm)
@@ -70,8 +78,8 @@ template<typename IntType>
 
   typedef typename thrust::detail::largest_available_float::type float_type;
 
-  const float_type real_min(parm.first);
-  const float_type real_max(parm.second);
+  const float_type real_min(static_cast<float_type>(parm.first));
+  const float_type real_max(static_cast<float_type>(parm.second));
 
   // add one to the right end of the interval because it is half-open
   // XXX adding 1.0 to a potentially large floating point number seems like a bad idea
@@ -82,6 +90,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   typename uniform_int_distribution<IntType>::result_type
     uniform_int_distribution<IntType>
       ::a(void) const
@@ -91,6 +100,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   typename uniform_int_distribution<IntType>::result_type
     uniform_int_distribution<IntType>
       ::b(void) const
@@ -100,6 +110,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   typename uniform_int_distribution<IntType>::param_type
     uniform_int_distribution<IntType>
       ::param(void) const
@@ -109,6 +120,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   void uniform_int_distribution<IntType>
     ::param(const param_type &parm)
 {
@@ -117,6 +129,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   typename uniform_int_distribution<IntType>::result_type
     uniform_int_distribution<IntType>
       ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
@@ -126,6 +139,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   typename uniform_int_distribution<IntType>::result_type
     uniform_int_distribution<IntType>
       ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
@@ -135,6 +149,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+  __host__ __device__
   bool uniform_int_distribution<IntType>
     ::equal(const uniform_int_distribution &rhs) const
 {
@@ -191,6 +206,7 @@ template<typename IntType>
 
 
 template<typename IntType>
+__host__ __device__
 bool operator==(const uniform_int_distribution<IntType> &lhs,
                 const uniform_int_distribution<IntType> &rhs)
 {
@@ -199,6 +215,7 @@ bool operator==(const uniform_int_distribution<IntType> &lhs,
 
 
 template<typename IntType>
+__host__ __device__
 bool operator!=(const uniform_int_distribution<IntType> &lhs,
                 const uniform_int_distribution<IntType> &rhs)
 {
@@ -228,5 +245,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/uniform_real_distribution.inl b/thrust/random/detail/uniform_real_distribution.inl
index aa880773b..119f82c1e 100644
--- a/thrust/random/detail/uniform_real_distribution.inl
+++ b/thrust/random/detail/uniform_real_distribution.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,16 +14,20 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/uniform_real_distribution.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 
 template<typename RealType>
+  __host__ __device__
   uniform_real_distribution<RealType>
     ::uniform_real_distribution(RealType a, RealType b)
       :m_param(a,b)
@@ -31,6 +35,7 @@ template<typename RealType>
 } // end uniform_real_distribution::uniform_real_distribution()
 
 template<typename RealType>
+  __host__ __device__
   uniform_real_distribution<RealType>
     ::uniform_real_distribution(const param_type &parm)
       :m_param(parm)
@@ -38,6 +43,7 @@ template<typename RealType>
 } // end uniform_real_distribution::uniform_real_distribution()
 
 template<typename RealType>
+  __host__ __device__
   void uniform_real_distribution<RealType>
     ::reset(void)
 {
@@ -45,6 +51,7 @@ template<typename RealType>
 
 template<typename RealType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename uniform_real_distribution<RealType>::result_type
       uniform_real_distribution<RealType>
         ::operator()(UniformRandomNumberGenerator &urng)
@@ -54,6 +61,7 @@ template<typename RealType>
 
 template<typename RealType>
   template<typename UniformRandomNumberGenerator>
+    __host__ __device__
     typename uniform_real_distribution<RealType>::result_type
       uniform_real_distribution<RealType>
         ::operator()(UniformRandomNumberGenerator &urng,
@@ -72,6 +80,7 @@ template<typename RealType>
 } // end uniform_real::operator()()
 
 template<typename RealType>
+  __host__ __device__
   typename uniform_real_distribution<RealType>::result_type
     uniform_real_distribution<RealType>
       ::a(void) const
@@ -80,6 +89,7 @@ template<typename RealType>
 } // end uniform_real::a()
 
 template<typename RealType>
+  __host__ __device__
   typename uniform_real_distribution<RealType>::result_type
     uniform_real_distribution<RealType>
       ::b(void) const
@@ -88,6 +98,7 @@ template<typename RealType>
 } // end uniform_real_distribution::b()
 
 template<typename RealType>
+  __host__ __device__
   typename uniform_real_distribution<RealType>::param_type
     uniform_real_distribution<RealType>
       ::param(void) const
@@ -96,6 +107,7 @@ template<typename RealType>
 } // end uniform_real_distribution::param()
 
 template<typename RealType>
+  __host__ __device__
   void uniform_real_distribution<RealType>
     ::param(const param_type &parm)
 {
@@ -103,6 +115,7 @@ template<typename RealType>
 } // end uniform_real_distribution::param()
 
 template<typename RealType>
+  __host__ __device__
   typename uniform_real_distribution<RealType>::result_type
     uniform_real_distribution<RealType>
       ::min THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
@@ -111,6 +124,7 @@ template<typename RealType>
 } // end uniform_real_distribution::min()
 
 template<typename RealType>
+  __host__ __device__
   typename uniform_real_distribution<RealType>::result_type
     uniform_real_distribution<RealType>
       ::max THRUST_PREVENT_MACRO_SUBSTITUTION (void) const
@@ -120,6 +134,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+  __host__ __device__
   bool uniform_real_distribution<RealType>
     ::equal(const uniform_real_distribution &rhs) const
 {
@@ -176,6 +191,7 @@ template<typename RealType>
 
 
 template<typename RealType>
+__host__ __device__
 bool operator==(const uniform_real_distribution<RealType> &lhs,
                 const uniform_real_distribution<RealType> &rhs)
 {
@@ -184,6 +200,7 @@ bool operator==(const uniform_real_distribution<RealType> &lhs,
 
 
 template<typename RealType>
+__host__ __device__
 bool operator!=(const uniform_real_distribution<RealType> &lhs,
                 const uniform_real_distribution<RealType> &rhs)
 {
@@ -213,5 +230,5 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/xor_combine_engine.inl b/thrust/random/detail/xor_combine_engine.inl
index 72670ce9c..c94821443 100644
--- a/thrust/random/detail/xor_combine_engine.inl
+++ b/thrust/random/detail/xor_combine_engine.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,17 +14,21 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/random/xor_combine_engine.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   xor_combine_engine<Engine1,s1,Engine2,s2>
     ::xor_combine_engine(void)
       :m_b1(),m_b2()
@@ -33,6 +37,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   xor_combine_engine<Engine1,s1,Engine2,s2>
     ::xor_combine_engine(const base1_type &urng1, const base2_type &urng2)
       :m_b1(urng1),m_b2(urng2)
@@ -41,6 +46,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   xor_combine_engine<Engine1,s1,Engine2,s2>
     ::xor_combine_engine(result_type s)
       :m_b1(s),m_b2(s)
@@ -49,6 +55,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   void xor_combine_engine<Engine1,s1,Engine2,s2>
     ::seed(void)
 {
@@ -58,6 +65,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   void xor_combine_engine<Engine1,s1,Engine2,s2>
     ::seed(result_type s)
 {
@@ -67,6 +75,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   const typename xor_combine_engine<Engine1,s1,Engine2,s2>::base1_type &
     xor_combine_engine<Engine1,s1,Engine2,s2>
       ::base1(void) const
@@ -76,6 +85,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   const typename xor_combine_engine<Engine1,s1,Engine2,s2>::base2_type &
     xor_combine_engine<Engine1,s1,Engine2,s2>
       ::base2(void) const
@@ -85,6 +95,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   typename xor_combine_engine<Engine1,s1,Engine2,s2>::result_type
     xor_combine_engine<Engine1,s1,Engine2,s2>
       ::operator()(void)
@@ -95,6 +106,7 @@ template<typename Engine1, size_t s1,
 
 template<typename Engine1, size_t s1,
          typename Engine2, size_t s2>
+  __host__ __device__
   void xor_combine_engine<Engine1, s1, Engine2, s2>
     ::discard(unsigned long long z)
 {
@@ -154,6 +166,7 @@ template<typename Engine1, size_t s1, typename Engine2, size_t s2>
 
 
 template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+  __host__ __device__
   bool xor_combine_engine<Engine1,s1,Engine2,s2>
     ::equal(const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs) const
 {
@@ -182,6 +195,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 
 template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+__host__ __device__
 bool operator==(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
                 const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs)
 {
@@ -190,6 +204,7 @@ bool operator==(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
 
 
 template<typename Engine1, size_t s1, typename Engine2, size_t s2>
+__host__ __device__
 bool operator!=(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
                 const xor_combine_engine<Engine1,s1,Engine2,s2> &rhs)
 {
@@ -199,5 +214,5 @@ bool operator!=(const xor_combine_engine<Engine1,s1,Engine2,s2> &lhs,
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/detail/xor_combine_engine_max.h b/thrust/random/detail/xor_combine_engine_max.h
index cfb5bdc83..0756ff9e0 100644
--- a/thrust/random/detail/xor_combine_engine_max.h
+++ b/thrust/random/detail/xor_combine_engine_max.h
@@ -16,13 +16,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/mpl/math.h>
 #include <limits>
 #include <cstddef>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -320,5 +321,5 @@ template<typename Engine1, size_t s1, typename Engine2, size_t s2, typename resu
 
 } // end random
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/random/discard_block_engine.h b/thrust/random/discard_block_engine.h
index 2d73649c2..88e115586 100644
--- a/thrust/random/discard_block_engine.h
+++ b/thrust/random/discard_block_engine.h
@@ -29,8 +29,7 @@
 #include <thrust/detail/cstdint.h>
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -246,7 +245,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::discard_block_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/discard_block_engine.inl>
 
diff --git a/thrust/random/linear_congruential_engine.h b/thrust/random/linear_congruential_engine.h
index 0dc72b3b1..dac03d90e 100644
--- a/thrust/random/linear_congruential_engine.h
+++ b/thrust/random/linear_congruential_engine.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <thrust/random/detail/linear_congruential_engine_discard.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -289,7 +288,7 @@ using random::linear_congruential_engine;
 using random::minstd_rand;
 using random::minstd_rand0;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/linear_congruential_engine.inl>
 
diff --git a/thrust/random/linear_feedback_shift_engine.h b/thrust/random/linear_feedback_shift_engine.h
index 90c572c9b..a46c6d8ab 100644
--- a/thrust/random/linear_feedback_shift_engine.h
+++ b/thrust/random/linear_feedback_shift_engine.h
@@ -35,8 +35,7 @@
 #include <cstddef> // for size_t
 #include <thrust/random/detail/random_core_access.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 
 namespace random
@@ -224,7 +223,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::linear_feedback_shift_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/linear_feedback_shift_engine.inl>
 
diff --git a/thrust/random/normal_distribution.h b/thrust/random/normal_distribution.h
index ac45e161a..36b985cb6 100644
--- a/thrust/random/normal_distribution.h
+++ b/thrust/random/normal_distribution.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/normal_distribution_base.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -269,7 +268,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::normal_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/normal_distribution.inl>
 
diff --git a/thrust/random/subtract_with_carry_engine.h b/thrust/random/subtract_with_carry_engine.h
index 0b12ca353..69ee841fd 100644
--- a/thrust/random/subtract_with_carry_engine.h
+++ b/thrust/random/subtract_with_carry_engine.h
@@ -28,8 +28,7 @@
 #include <cstddef> // for size_t
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -250,7 +249,7 @@ using random::subtract_with_carry_engine;
 using random::ranlux24_base;
 using random::ranlux48_base;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/subtract_with_carry_engine.inl>
 
diff --git a/thrust/random/uniform_int_distribution.h b/thrust/random/uniform_int_distribution.h
index 42d745781..18f369fc2 100644
--- a/thrust/random/uniform_int_distribution.h
+++ b/thrust/random/uniform_int_distribution.h
@@ -27,8 +27,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -110,7 +109,8 @@ template<typename IntType = int>
      *           the platform.
      */
     __host__ __device__
-    explicit uniform_int_distribution(IntType a = 0, IntType b = thrust::detail::integer_traits<IntType>::const_max);
+    explicit uniform_int_distribution(IntType a = 0,
+                                      IntType b = THRUST_NS_QUALIFIER::detail::integer_traits<IntType>::const_max);
 
     /*! This constructor creates a new \p uniform_int_distribution from a \p param_type object
      *  encapsulating the range of the distribution.
@@ -270,7 +270,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::uniform_int_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/uniform_int_distribution.inl>
 
diff --git a/thrust/random/uniform_real_distribution.h b/thrust/random/uniform_real_distribution.h
index 312104570..e6c5a7d88 100644
--- a/thrust/random/uniform_real_distribution.h
+++ b/thrust/random/uniform_real_distribution.h
@@ -26,8 +26,7 @@
 #include <thrust/random/detail/random_core_access.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -268,7 +267,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 
 using random::uniform_real_distribution;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/uniform_real_distribution.inl>
 
diff --git a/thrust/random/xor_combine_engine.h b/thrust/random/xor_combine_engine.h
index d5e86b7a9..321f04033 100644
--- a/thrust/random/xor_combine_engine.h
+++ b/thrust/random/xor_combine_engine.h
@@ -29,8 +29,7 @@
 #include <iostream>
 #include <cstddef> // for size_t
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace random
 {
@@ -265,7 +264,7 @@ operator>>(std::basic_istream<CharT,Traits> &is,
 // import names into thrust::
 using random::xor_combine_engine;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/random/detail/xor_combine_engine.inl>
 
diff --git a/thrust/reduce.h b/thrust/reduce.h
index 08ad84b18..c7b378f72 100644
--- a/thrust/reduce.h
+++ b/thrust/reduce.h
@@ -15,7 +15,7 @@
  */
 
 
-/*! \file reduce.h
+/*! \file thrust/reduce.h
  *  \brief Functions for reducing a range to a single value
  */
 
@@ -26,9 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -58,7 +56,7 @@ namespace thrust
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
  *          \c value_type. If \c T is \c InputIterator's \c value_type, then
@@ -77,7 +75,7 @@ namespace thrust
  *  // result == 9
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename DerivedPolicy, typename InputIterator>
 __host__ __device__
@@ -104,7 +102,7 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p InputIterator's
  *          \c value_type. If \c T is \c InputIterator's \c value_type, then
@@ -122,7 +120,7 @@ __host__ __device__
  *  // result == 9
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename InputIterator> typename
   thrust::iterator_traits<InputIterator>::value_type reduce(InputIterator first, InputIterator last);
@@ -152,7 +150,7 @@ template<typename InputIterator> typename
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p T.
  *  \tparam T is convertible to \p InputIterator's \c value_type.
@@ -171,7 +169,7 @@ template<typename InputIterator> typename
  *  // result == 10
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename DerivedPolicy, typename InputIterator, typename T>
 __host__ __device__
@@ -201,7 +199,7 @@ __host__ __device__
  *  \param init The initial value.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and if \c x and \c y are objects of \p InputIterator's \c value_type,
  *          then <tt>x + y</tt> is defined and is convertible to \p T.
  *  \tparam T is convertible to \p InputIterator's \c value_type.
@@ -218,7 +216,7 @@ __host__ __device__
  *  // result == 10
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  */
 template<typename InputIterator, typename T>
   T reduce(InputIterator first,
@@ -251,11 +249,11 @@ template<typename InputIterator, typename T>
  *  \return The result of the reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \c InputIterator's \c value_type is convertible to \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p reduce to
@@ -275,7 +273,7 @@ template<typename InputIterator, typename T>
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  *  \see transform_reduce
  */
 template<typename DerivedPolicy,
@@ -311,11 +309,11 @@ __host__ __device__
  *  \param binary_op The binary function used to 'sum' values.
  *  \return The result of the reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *          and \c InputIterator's \c value_type is convertible to \c T.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p reduce to
@@ -332,7 +330,7 @@ __host__ __device__
  *  // result == 3
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/accumulate.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/accumulate
  *  \see transform_reduce
  */
 template<typename InputIterator,
@@ -364,11 +362,11 @@ template<typename InputIterator,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -430,11 +428,11 @@ __host__ __device__
  *  \param values_output The beginning of the output value range.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -496,13 +494,13 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -567,13 +565,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -641,14 +639,14 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -721,14 +719,14 @@ __host__ __device__
  *  \param binary_op The binary function used to accumulate values.
  *  \return A pair of iterators at end of the ranges <tt>[keys_output, keys_output_last)</tt> and <tt>[values_output, values_output_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *          and \c BinaryFunction's \c result_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -778,8 +776,6 @@ template<typename InputIterator1,
 /*! \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/reduce.inl>
-
diff --git a/thrust/remove.h b/thrust/remove.h
index 61e6b0a6b..a57fcf211 100644
--- a/thrust/remove.h
+++ b/thrust/remove.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup stream_compaction Stream Compaction
  *  \ingroup reordering
@@ -54,9 +52,9 @@ namespace thrust
  *          elements which are not equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p remove to remove a number
@@ -82,12 +80,12 @@ namespace thrust
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove_if
  *  \see remove_copy
  *  \see remove_copy_if
@@ -117,9 +115,9 @@ __host__ __device__
  *  \return A \p ForwardIterator pointing to the end of the resulting range of
  *          elements which are not equal to \p value.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p remove to remove a number
@@ -144,12 +142,12 @@ __host__ __device__
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove(S.begin(), S.end(), x), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove_if
  *  \see remove_copy
  *  \see remove_copy_if
@@ -179,10 +177,10 @@ template<typename ForwardIterator,
  *          which are not equal to \p value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -203,7 +201,7 @@ template<typename ForwardIterator,
  *  // result is now {-2, -1, 1, 2}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_if
  *  \see remove_copy_if
@@ -234,10 +232,10 @@ __host__ __device__
  *  \return An OutputIterator pointing to the end of the resulting range of elements
  *          which are not equal to \p value.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable">Equality Comparable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          and objects of type \p T can be compared for equality with objects of \p InputIterator's \c value_type.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -256,7 +254,7 @@ __host__ __device__
  *  // result is now {-2, -1, 1, 2}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_if
  *  \see remove_copy_if
@@ -290,10 +288,10 @@ template<typename InputIterator,
  *          elements for which \p pred evaluated to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p remove_if to remove
  *  all even numbers from an array of integers using the \p thrust::host execution policy for
@@ -329,12 +327,12 @@ template<typename InputIterator,
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -365,10 +363,10 @@ __host__ __device__
  *  \return A ForwardIterator pointing to the end of the resulting range of
  *          elements for which \p pred evaluated to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p remove_if to remove
  *  all even numbers from an array of integers.
@@ -402,12 +400,12 @@ __host__ __device__
  *  range after elements have been removed from it; it follows that the elements
  *  after that iterator are of no interest, and may be discarded. If you are
  *  removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a>, you may
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a>, you may
  *  simply erase them. That is, a reasonable way of removing elements from a
- *  <a href="http://www.sgi.com/tech/stl/Sequence.html">Sequence</a> is
+ *  <a href="https://en.cppreference.com/w/cpp/container">Sequence</a> is
  *  <tt>S.erase(remove_if(S.begin(), S.end(), pred), S.end())</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -438,11 +436,11 @@ template<typename ForwardIterator,
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -471,7 +469,7 @@ template<typename ForwardIterator,
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -503,11 +501,11 @@ __host__ __device__
  *              to the resulting sequence.
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -534,7 +532,7 @@ __host__ __device__
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -569,11 +567,11 @@ template<typename InputIterator,
  *          elements for which \p pred evaluated to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -590,14 +588,14 @@ template<typename InputIterator,
  *  int A[N] = {1, 4, 2, 8, 5, 7};
  *  int S[N] = {0, 1, 1, 1, 0, 0};
  *
- *  int *new_end = thrust::remove(thrust::host, A, A + N, S, thrust::identity<int>());
+ *  int *new_end = thrust::remove_if(thrust::host, A, A + N, S, thrust::identity<int>());
  *  // The first three values of A are now {1, 5, 7}
  *  // Values beyond new_end are unspecified
  *  \endcode
  *
  *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -631,11 +629,11 @@ __host__ __device__
  *  \return A ForwardIterator pointing to the end of the resulting range of
  *          elements for which \p pred evaluated to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/ForwardIterator.html">Forward Iterator</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
@@ -650,14 +648,14 @@ __host__ __device__
  *  int A[N] = {1, 4, 2, 8, 5, 7};
  *  int S[N] = {0, 1, 1, 1, 0, 0};
  *
- *  int *new_end = thrust::remove(A, A + N, S, thrust::identity<int>());
+ *  int *new_end = thrust::remove_if(A, A + N, S, thrust::identity<int>());
  *  // The first three values of A are now {1, 5, 7}
  *  // Values beyond new_end are unspecified
  *  \endcode
  *
  *  \note The range <tt>[first, last)</tt> is not permitted to overlap with the range <tt>[stencil, stencil + (last - first))</tt>.
  *
- *  \see http://www.sgi.com/tech/stl/remove_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove
  *  \see remove
  *  \see remove_copy
  *  \see remove_copy_if
@@ -692,12 +690,12 @@ template<typename ForwardIterator,
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -718,7 +716,7 @@ template<typename ForwardIterator,
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -755,12 +753,12 @@ __host__ __device__
  *              to the resulting sequence.
  *  \return An OutputIterator pointing to the end of the resulting range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertible to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt>.
  *
@@ -779,7 +777,7 @@ __host__ __device__
  *  // result is now {-1, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/remove_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/remove_copy
  *  \see remove
  *  \see remove_copy
  *  \see remove_if
@@ -799,8 +797,6 @@ template<typename InputIterator1,
 /*! \} // end stream_compaction
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/remove.inl>
-
diff --git a/thrust/replace.h b/thrust/replace.h
index 225cb060a..a5c0320c4 100644
--- a/thrust/replace.h
+++ b/thrust/replace.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \addtogroup replacing
@@ -48,10 +46,10 @@ namespace thrust
  *  \param new_value The new value to replace \p old_value.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable>Assignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">EqualityComparable</a>,
  *          objects of \p T may be compared for equality with objects of
  *          \p ForwardIterator's \c value_type,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
@@ -78,7 +76,7 @@ namespace thrust
  *  // A contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace_if
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -100,10 +98,10 @@ __host__ __device__
  *  \param old_value The value to replace.
  *  \param new_value The new value to replace \p old_value.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html>Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">EqualityComparable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable>Assignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">EqualityComparable</a>,
  *          objects of \p T may be compared for equality with objects of
  *          \p ForwardIterator's \c value_type,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
@@ -128,7 +126,7 @@ __host__ __device__
  *  // A contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace_if
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -152,11 +150,11 @@ template<typename ForwardIterator, typename T>
  *         to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -192,7 +190,7 @@ template<typename ForwardIterator, typename T>
  *  // A contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -215,11 +213,11 @@ __host__ __device__
  *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
  *         to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -253,7 +251,7 @@ __host__ __device__
  *  // A contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -280,12 +278,12 @@ template<typename ForwardIterator, typename Predicate, typename T>
  *         to \c true.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -326,7 +324,7 @@ template<typename ForwardIterator, typename Predicate, typename T>
  *  // A contains [0, 20, 0, 40]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -352,12 +350,12 @@ __host__ __device__
  *  \param new_value The new value to replace elements which <tt>pred(*i)</tt> evaluates
  *         to \c true.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p replace_if to replace
@@ -396,7 +394,7 @@ __host__ __device__
  *  // A contains [0, 20, 0, 40]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace
  *  \see \c replace
  *  \see \c replace_copy
  *  \see \c replace_copy_if
@@ -427,10 +425,10 @@ template<typename ForwardIterator, typename InputIterator, typename Predicate, t
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          \p T may be compared for equality with \p InputIterator's \c value_type,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
@@ -454,7 +452,7 @@ template<typename ForwardIterator, typename InputIterator, typename Predicate, t
  *  // B contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c copy
  *  \see \c replace
  *  \see \c replace_if
@@ -484,10 +482,10 @@ __host__ __device__
  *  \param new_value The replacement value for which <tt>*i == old_value</tt> evaluates to \c true.
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
- *          \p T is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
+ *          \p T is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>,
  *          \p T may be compared for equality with \p InputIterator's \c value_type,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
@@ -510,7 +508,7 @@ __host__ __device__
  *  // B contains [99, 2, 3, 99]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c copy
  *  \see \c replace
  *  \see \c replace_if
@@ -541,11 +539,11 @@ template<typename InputIterator, typename OutputIterator, typename T>
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -580,7 +578,7 @@ template<typename InputIterator, typename OutputIterator, typename T>
  *  // B contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c replace
  *  \see \c replace_if
  *  \see \c replace_copy
@@ -609,11 +607,11 @@ __host__ __device__
  *  \param new_value The replacement value to assign <tt>pred(*i)</tt> evaluates to \c true.
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -647,7 +645,7 @@ __host__ __device__
  *  // B contains [1, 0, 2, 0]
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/replace_copy_if.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/replace_copy
  *  \see \c replace
  *  \see \c replace_if
  *  \see \c replace_copy
@@ -679,12 +677,12 @@ template<typename InputIterator, typename OutputIterator, typename Predicate, ty
  *  \return <tt>result + (last-first)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -755,12 +753,12 @@ __host__ __device__
  *  \param new_value The replacement value to assign when <tt>pred(*s)</tt> evaluates to \c true. 
  *  \return <tt>result + (last-first)</tt>
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \p InputIterator2's \c value_type is convertible to \p Predicate's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the ranges <tt>[first, last)</tt> and <tt>[result, result + (last - first))</tt> shall not overlap otherwise.
@@ -816,8 +814,6 @@ template<typename InputIterator1, typename InputIterator2, typename OutputIterat
  *  \} // transformations
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/replace.inl>
-
diff --git a/thrust/reverse.h b/thrust/reverse.h
index 7d08aeb77..056be200a 100644
--- a/thrust/reverse.h
+++ b/thrust/reverse.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reordering
  *  \ingroup algorithms
@@ -44,7 +42,7 @@ namespace thrust
  *  \param last The end of the range to reverse.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a> and
  *          \p BidirectionalIterator is mutable.
  *
  *  The following code snippet demonstrates how to use \p reverse to reverse a
@@ -62,7 +60,7 @@ namespace thrust
  *  // v is now {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse
  *  \see \p reverse_copy
  *  \see \p reverse_iterator
  */
@@ -80,7 +78,7 @@ __host__ __device__
  *  \param first The beginning of the range to reverse.
  *  \param last The end of the range to reverse.
  *
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a> and
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a> and
  *          \p BidirectionalIterator is mutable.
  *
  *  The following code snippet demonstrates how to use \p reverse to reverse a
@@ -96,7 +94,7 @@ __host__ __device__
  *  // v is now {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse
  *  \see \p reverse_copy
  *  \see \p reverse_iterator
  */
@@ -105,7 +103,7 @@ template<typename BidirectionalIterator>
                BidirectionalIterator last);
 
 
-/*! \p reverse_copy differs from \ref reverse only in that the reversed range
+/*! \p reverse_copy differs from \p reverse only in that the reversed range
  *  is written to a different output range, rather than inplace.
  *
  *  \p reverse_copy copies elements from the range <tt>[first, last)</tt> to the
@@ -124,9 +122,9 @@ template<typename BidirectionalIterator>
  *  \param result The beginning of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a>,
  *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -147,7 +145,7 @@ template<typename BidirectionalIterator>
  *  // output is now  {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse_copy
  *  \see \p reverse
  *  \see \p reverse_iterator
  */
@@ -159,7 +157,7 @@ __host__ __device__
                               OutputIterator result);
 
 
-/*! \p reverse_copy differs from \ref reverse only in that the reversed range
+/*! \p reverse_copy differs from \p reverse only in that the reversed range
  *  is written to a different output range, rather than inplace.
  *
  *  \p reverse_copy copies elements from the range <tt>[first, last)</tt> to the
@@ -174,9 +172,9 @@ __host__ __device__
  *  \param last The end of the range to reverse.
  *  \param result The beginning of the output range.
  *
- *  \tparam BidirectionalIterator is a model of <a href="http://www.sgi.com/tech/stl/BidirectionalIterator.html">Bidirectional Iterator</a>,
+ *  \tparam BidirectionalIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/bidirectional_iterator">Bidirectional Iterator</a>,
  *          and \p BidirectionalIterator's \p value_type is convertible to \p OutputIterator's \p value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The range <tt>[first, last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -195,7 +193,7 @@ __host__ __device__
  *  // output is now  {5, 4, 3, 2, 1, 0}
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/reverse_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/reverse_copy
  *  \see \p reverse
  *  \see \p reverse_iterator
  */
@@ -208,8 +206,6 @@ template<typename BidirectionalIterator, typename OutputIterator>
 /*! \} // end reordering
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/reverse.inl>
-
diff --git a/thrust/scan.h b/thrust/scan.h
index 4543f2183..9b3814223 100644
--- a/thrust/scan.h
+++ b/thrust/scan.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -46,12 +44,16 @@ namespace thrust
  *  This version of \p inclusive_scan assumes plus as the associative operator.  
  *  When the input and output sequences are the same, the scan is performed 
  *  in-place.
- 
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -61,10 +63,10 @@ namespace thrust
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -87,7 +89,7 @@ namespace thrust
  *  // data is now {1, 1, 3, 5, 6, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  *
  */
 template<typename DerivedPolicy,
@@ -108,21 +110,25 @@ __host__ __device__
  *  This version of \p inclusive_scan assumes plus as the associative operator.  
  *  When the input and output sequences are the same, the scan is performed 
  *  in-place.
- 
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -142,7 +148,7 @@ __host__ __device__
  *  // data is now {1, 1, 3, 5, 6, 9}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  *
  */
 template<typename InputIterator,
@@ -156,12 +162,16 @@ template<typename InputIterator,
  *  term 'inclusive' means that each result includes the corresponding
  *  input operand in the partial sum.  When the input and output sequences 
  *  are the same, the scan is performed in-place.
- *    
+ *
  *  \p inclusive_scan is similar to \c std::partial_sum in the STL.  The primary
  *  difference between the two functions is that \c std::partial_sum guarantees
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -172,14 +182,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -198,7 +208,7 @@ template<typename InputIterator,
  *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -222,20 +232,24 @@ __host__ __device__
  *  a serial summation order, while \p inclusive_scan requires associativity of 
  *  the binary operation to parallelize the prefix sum.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -253,7 +267,7 @@ __host__ __device__
  *  // data is now {-5, 0, 2, 2, 2, 4, 4, 4, 4, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -273,6 +287,10 @@ template<typename InputIterator,
  *  associative operator and \c 0 as the initial value.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -282,10 +300,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -308,7 +326,7 @@ template<typename InputIterator,
  *  // data is now {0, 1, 1, 3, 5, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -328,16 +346,20 @@ __host__ __device__
  *  and so on. This version of \p exclusive_scan assumes plus as the 
  *  associative operator and \c 0 as the initial value.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined. If \c T is
  *                         \c OutputIterator's \c value_type, then <tt>T(0)</tt> is
@@ -357,7 +379,7 @@ __host__ __device__
  *  // data is now {0, 1, 1, 3, 5, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator>
@@ -375,6 +397,10 @@ template<typename InputIterator,
  *  operator but requires an initial value \p init.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -385,10 +411,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
@@ -409,7 +435,7 @@ template<typename InputIterator,
  *  // data is now {4, 5, 5, 7, 9, 10}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -432,16 +458,20 @@ __host__ __device__
  *  operator but requires an initial value \p init.  When the input and 
  *  output sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
  *  \param init The initial value.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's
  *                         \c value_type, then <tt>x + y</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
@@ -460,7 +490,7 @@ __host__ __device__
  *  // data is now {4, 5, 5, 7, 9, 10}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -476,10 +506,14 @@ template<typename InputIterator,
  *  corresponding input operand in the partial sum.  More precisely,
  *  \p init is assigned to <tt>\*result</tt> and the value
  *  <tt>binary_op(init, \*first)</tt> is assigned to <tt>\*(result + 1)</tt>,
- *  and so on. This version of the function requires both and associative 
+ *  and so on. This version of the function requires both an associative 
  *  operator and an initial value \p init.  When the input and output
  *  sequences are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *    
  *  \param exec The execution policy to use for parallelization.
@@ -491,15 +525,15 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -523,7 +557,7 @@ template<typename InputIterator,
  *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -544,10 +578,14 @@ __host__ __device__
  *  corresponding input operand in the partial sum.  More precisely,
  *  \p init is assigned to <tt>\*result</tt> and the value
  *  <tt>binary_op(init, \*first)</tt> is assigned to <tt>\*(result + 1)</tt>,
- *  and so on. This version of the function requires both and associative 
+ *  and so on. This version of the function requires both an associative 
  *  operator and an initial value \p init.  When the input and output
  *  sequences are the same, the scan is performed in-place.
- *    
+ *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
@@ -555,15 +593,15 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to
  *                        \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>
  *                         and \c OutputIterator's \c value_type is convertible to
  *                         both \c AssociativeOperator's \c first_argument_type and
  *                         \c second_argument_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -584,7 +622,7 @@ __host__ __device__
  *  // data is now {1, 1, 1, 2, 2, 2, 4, 4, 4, 4 }
  *  \endcode
  *  
- *  \see http://www.sgi.com/tech/stl/partial_sum.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/partial_sum
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -620,6 +658,10 @@ template<typename InputIterator,
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -630,10 +672,10 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *
@@ -651,7 +693,7 @@ template<typename InputIterator,
  *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
  *
- *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals); // in-place scan
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -689,16 +731,20 @@ __host__ __device__
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
  *  \param result The beginning of the output value sequence.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *
@@ -713,7 +759,7 @@ __host__ __device__
  *  int data[10] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
  *  int keys[10] = {0, 0, 0, 1, 1, 2, 3, 3, 3, 3};
  *
- *  thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals); // in-place scan
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -748,6 +794,10 @@ template<typename InputIterator1,
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec. 
  *
  *  \param exec The execution policy to use for parallelization.
@@ -759,13 +809,13 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
  *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -784,7 +834,7 @@ template<typename InputIterator1,
  *
  *  thrust::equal_to<int> binary_pred;
  *
- *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred); // in-place scan
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -824,6 +874,10 @@ __host__ __device__
  *  operator used to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -831,13 +885,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality of keys.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
  *  \pre \p first2 may equal \p result but the range <tt>[first2, first2 + (last1 - first1)</tt> and range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -853,7 +907,7 @@ __host__ __device__
  *
  *  thrust::equal_to<int> binary_pred;
  *
- *  thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals, binary_pred); // in-place scan
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -890,6 +944,10 @@ template<typename InputIterator1,
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -902,14 +960,14 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -931,7 +989,7 @@ template<typename InputIterator1,
  *  thrust::equal_to<int> binary_pred;
  *  thrust::plus<int>     binary_op;
  *
- *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, vals, vals, binary_pred, binary_op); // in-place scan
+ *  thrust::inclusive_scan_by_key(thrust::host, keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -969,6 +1027,10 @@ __host__ __device__
  *  belong to the same segment if <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  This version of \p inclusive_scan_by_key uses the associative operator 
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
@@ -981,14 +1043,14 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -1007,7 +1069,7 @@ __host__ __device__
  *  thrust::equal_to<int> binary_pred;
  *  thrust::plus<int>     binary_op;
  *
- *  thrust::inclusive_scan_by_key(keys, keys + 10, vals, vals, binary_pred, binary_op); // in-place scan
+ *  thrust::inclusive_scan_by_key(keys, keys + 10, data, data, binary_pred, binary_op); // in-place scan
  *
  *  // data is now {1, 2, 3, 1, 2, 1, 1, 2, 3, 4};
  *  \endcode
@@ -1044,6 +1106,10 @@ template<typename InputIterator1,
  *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
  *
  *  The algorithm's execution is parallelized as determined by \p exec.
@@ -1103,6 +1169,10 @@ __host__ __device__
  *  belong to the same segment if <tt>*i == *(i+1)</tt>, and belong to 
  *  different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  Refer to the most general form of \p exclusive_scan_by_key for additional details.
  *
  *  \param first1 The beginning of the key sequence.
@@ -1148,6 +1218,10 @@ template<typename InputIterator1,
  *  This version of \p exclusive_scan_by_key uses the value \c init to
  *  initialize the exclusive scan operation.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1208,6 +1282,10 @@ __host__ __device__
  *  This version of \p exclusive_scan_by_key uses the value \c init to
  *  initialize the exclusive scan operation.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1264,6 +1342,10 @@ template<typename InputIterator1,
  *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
  *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1334,6 +1416,10 @@ __host__ __device__
  *  <tt>i+1</tt> in the range <tt>[first1, last1)</tt> belong to the same segment if
  *  <tt>binary_pred(*i, *(i+1))</tt> is true, and belong to different segments otherwise.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1399,6 +1485,10 @@ template<typename InputIterator1,
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  The algorithm's execution is parallelized as determined by \p exec.
  *
  *  \param exec The execution policy to use for parallelization.
@@ -1412,15 +1502,15 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -1489,6 +1579,10 @@ __host__ __device__
  *  \c binary_op to perform the prefix sum. When the input and output sequences
  *  are the same, the scan is performed in-place.
  *
+ *  Results are not deterministic for pseudo-associative operators (e.g.,
+ *  addition of floating-point types). Results for pseudo-associative
+ *  operators may vary from run to run.
+ *
  *  \param first1 The beginning of the key sequence.
  *  \param last1 The end of the key sequence.
  *  \param first2 The beginning of the input value sequence.
@@ -1498,15 +1592,15 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>,
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>,
  *                         and if \c x and \c y are objects of \c OutputIterator's \c value_type, then 
  *                         <tt>binary_op(x,y)</tt> is defined.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c AssociativeOperator's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result but the range <tt>[first1, last1)</tt> and the range <tt>[result, result + (last1 - first1))</tt> shall not overlap otherwise.
@@ -1557,8 +1651,6 @@ template<typename InputIterator1,
 /*! \} // end prefix sums
  */
 
-	
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/scan.inl>
-
diff --git a/thrust/scatter.h b/thrust/scatter.h
index baaf1e63b..b8b0bd84f 100644
--- a/thrust/scatter.h
+++ b/thrust/scatter.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup scattering
  *  \ingroup copying
@@ -50,9 +48,9 @@ namespace thrust
  *  \param result Destination of the source elements.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -110,9 +108,9 @@ __host__ __device__
  *  \param map  Beginning of the sequence of output indices.
  *  \param result Destination of the source elements.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -171,10 +169,10 @@ template<typename InputIterator1,
  *  \param output Beginning of the destination range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -228,10 +226,10 @@ __host__ __device__
  *  \param stencil Beginning of the sequence of predicate values.
  *  \param output Beginning of the destination range.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c bool.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -286,11 +284,11 @@ template<typename InputIterator1,
  *  \param pred Predicate to apply to the stencil values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -359,11 +357,11 @@ __host__ __device__
  *  \param output Beginning of the destination range.
  *  \param pred Predicate to apply to the stencil values.
  *
- *  \tparam InputIterator1 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
- *  \tparam InputIterator2 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
- *  \tparam InputIterator3 must be a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
- *  \tparam RandomAccessIterator must be a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access iterator</a>.
- *  \tparam Predicate must be a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam InputIterator1 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator1's \c value_type must be convertible to \c RandomAccessIterator's \c value_type.
+ *  \tparam InputIterator2 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator2's \c value_type must be convertible to \c RandomAccessIterator's \c difference_type.
+ *  \tparam InputIterator3 must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a> and \c InputIterator3's \c value_type must be convertible to \c Predicate's \c argument_type.
+ *  \tparam RandomAccessIterator must be a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access iterator</a>.
+ *  \tparam Predicate must be a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre The iterator `result + i` shall not refer to any element referenced by any iterator `j` in the range `[first,last)` for all iterators `i` in the range `[map,map + (last - first))`.
  *
@@ -416,8 +414,6 @@ template<typename InputIterator1,
 /*! \} // end scattering
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/scatter.inl>
-
diff --git a/thrust/sequence.h b/thrust/sequence.h
index e92391f64..fb3959e3c 100644
--- a/thrust/sequence.h
+++ b/thrust/sequence.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -45,7 +43,7 @@ namespace thrust
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
@@ -66,7 +64,7 @@ namespace thrust
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator>
 __host__ __device__
@@ -83,7 +81,7 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
@@ -103,7 +101,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator>
   void sequence(ForwardIterator first,
@@ -123,11 +121,11 @@ template<typename ForwardIterator>
  *  \param init The first value of the sequence of numbers.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -147,7 +145,7 @@ template<typename ForwardIterator>
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -166,11 +164,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param init The first value of the sequence of numbers.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -188,7 +186,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator, typename T>
   void sequence(ForwardIterator first,
@@ -210,11 +208,11 @@ template<typename ForwardIterator, typename T>
  *  \param step The difference between consecutive elements.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -234,7 +232,7 @@ template<typename ForwardIterator, typename T>
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -255,11 +253,11 @@ __host__ __device__
  *  \param init The first value of the sequence of numbers
  *  \param step The difference between consecutive elements.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam T is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam T is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and \p T is convertible to \p ForwardIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p sequence to fill a range
@@ -277,7 +275,7 @@ __host__ __device__
  *  \note Unlike the similar C++ STL function \c std::iota, \p sequence offers no
  *        guarantee on order of execution.
  *
- *  \see http://www.sgi.com/tech/stl/iota.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/iota
  */
 template<typename ForwardIterator, typename T>
   void sequence(ForwardIterator first,
@@ -289,8 +287,7 @@ template<typename ForwardIterator, typename T>
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/sequence.inl>
 
diff --git a/thrust/set_operations.h b/thrust/set_operations.h
index a51eaed43..65a48d1b6 100644
--- a/thrust/set_operations.h
+++ b/thrust/set_operations.h
@@ -25,9 +25,7 @@
 #include <thrust/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup set_operations Set Operations
  *  \ingroup algorithms
@@ -61,17 +59,17 @@ namespace thrust
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -84,16 +82,16 @@ namespace thrust
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A1[7] = {0, 1, 3, 4, 5, 6, 9};
  *  int A2[5] = {1, 3, 5, 7, 9};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result is now {0, 4, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -136,17 +134,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -157,16 +155,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {0, 1, 3, 4, 5, 6, 9};
+ *  int A1[7] = {0, 1, 3, 4, 5, 6, 9};
  *  int A2[5] = {1, 3, 5, 7, 9};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result is now {0, 4, 6}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -211,14 +209,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -232,16 +230,16 @@ template<typename InputIterator1,
  *  #include <thrust/functional.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A1[7] = {9, 6, 5, 4, 3, 1, 0};
  *  int A2[5] = {9, 7, 5, 3, 1};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  int *result_end = thrust::set_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
  *  // result is now {6, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -287,14 +285,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -306,16 +304,16 @@ __host__ __device__
  *  #include <thrust/set_operations.h>
  *  #include <thrust/functional.h>
  *  ...
- *  int A1[6] = {9, 6, 5, 4, 3, 1, 0};
+ *  int A1[7] = {9, 6, 5, 4, 3, 1, 0};
  *  int A2[5] = {9, 7, 5, 3, 1};
  *
  *  int result[3];
  *
- *  int *result_end = thrust::set_difference(A1, A1 + 6, A2, A2 + 5, result, thrust::greater<int>());
+ *  int *result_end = thrust::set_difference(A1, A1 + 7, A2, A2 + 5, result, thrust::greater<int>());
  *  // result is now {6, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_difference
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -368,17 +366,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -400,7 +398,7 @@ template<typename InputIterator1,
  *  // result is now {1, 3, 5}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -450,17 +448,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -480,7 +478,7 @@ __host__ __device__
  *  // result is now {1, 3, 5}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -534,17 +532,17 @@ template<typename InputIterator1,
  *  \pre The resulting range shall not overlap with either input range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  The following code snippet demonstrates how to use \p set_intersection to compute
  *  the set intersection of sets of integers sorted in descending order using the \p thrust::host execution
@@ -563,7 +561,7 @@ template<typename InputIterator1,
  *  // result is now {5, 3, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -618,17 +616,17 @@ __host__ __device__
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  The following code snippet demonstrates how to use \p set_intersection to compute
  *  the set intersection of sets of integers sorted in descending order.
@@ -645,7 +643,7 @@ __host__ __device__
  *  // result is now {5, 3, 1}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_intersection.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_intersection
  *  \see \p includes
  *  \see \p set_union
  *  \see \p set_intersection
@@ -694,17 +692,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -717,16 +715,16 @@ template<typename InputIterator1,
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A1[7] = {0, 1, 2, 2, 4, 6, 7};
  *  int A2[5] = {1, 1, 2, 5, 8};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {0, 4, 5, 6, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -773,17 +771,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -794,16 +792,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {0, 1, 2, 2, 4, 6, 7};
+ *  int A1[7] = {0, 1, 2, 2, 4, 6, 7};
  *  int A2[5] = {1, 1, 2, 5, 8};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {0, 4, 5, 6, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -852,17 +850,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -875,16 +873,16 @@ template<typename InputIterator1,
  *  #include <thrust/set_operations.h>
  *  #include <thrust/execution_policy.h>
  *  ...
- *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A1[7] = {7, 6, 4, 2, 2, 1, 0};
  *  int A2[5] = {8, 5, 2, 1, 1};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(thrust::host, A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {8, 7, 6, 5, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -934,17 +932,17 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -955,16 +953,16 @@ __host__ __device__
  *  \code
  *  #include <thrust/set_operations.h>
  *  ...
- *  int A1[6] = {7, 6, 4, 2, 2, 1, 0};
+ *  int A1[7] = {7, 6, 4, 2, 2, 1, 0};
  *  int A2[5] = {8, 5, 2, 1, 1};
  *
  *  int result[6];
  *
- *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 6, A2, A2 + 5, result);
+ *  int *result_end = thrust::set_symmetric_difference(A1, A1 + 7, A2, A2 + 5, result);
  *  // result = {8, 7, 6, 5, 4, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_symmetric_difference.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_symmetric_difference
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_difference
@@ -1012,17 +1010,17 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1044,7 +1042,7 @@ template<typename InputIterator1,
  *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1089,17 +1087,17 @@ __host__ __device__
  *  \param result The beginning of the output range.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1119,7 +1117,7 @@ __host__ __device__
  *  // result = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1166,14 +1164,14 @@ template<typename InputIterator1,
  *  \return The end of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1196,7 +1194,7 @@ template<typename InputIterator1,
  *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1244,14 +1242,14 @@ __host__ __device__
  *  \param comp Comparison operator.
  *  \return The end of the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1's \c value_type is convertable to \p StrictWeakCompare's \c first_argument_type.
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2's \c value_type is convertable to \p StrictWeakCompare's \c second_argument_type.
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[first1, last1)</tt> and <tt>[first2, last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting range shall not overlap with either input range.
@@ -1272,7 +1270,7 @@ __host__ __device__
  *  // result = {12, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/set_union.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/set_union
  *  \see \p merge
  *  \see \p includes
  *  \see \p set_union
@@ -1330,22 +1328,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1431,22 +1429,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1532,23 +1530,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1638,23 +1636,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1745,20 +1743,20 @@ template<typename InputIterator1,
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1845,20 +1843,20 @@ __host__ __device__
  *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -1945,21 +1943,21 @@ template<typename InputIterator1,
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2050,21 +2048,21 @@ __host__ __device__
  *  \note Unlike the other key-value set operations, \p set_intersection_by_key is unique in that it has no
  *        \c values_first2 parameter because elements from the second input range are never copied to the output range.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2153,22 +2151,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2257,22 +2255,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2361,23 +2359,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2470,23 +2468,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2575,22 +2573,22 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2677,22 +2675,22 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to <tt>operator<</tt>.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2779,23 +2777,23 @@ template<typename InputIterator1,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2886,23 +2884,23 @@ __host__ __device__
  *  \return A \p pair \c p such that <tt>p.first</tt> is the end of the output range of keys,
  *          and such that <tt>p.second</tt> is the end of the output range of values.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator1 and \p InputIterator2 have the same \c value_type,
- *          \p InputIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator1's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator1's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          \p InputIterator2 and \p InputIterator1 have the same \c value_type,
- *          \p InputIterator2's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a>,
- *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="http://www.sgi.com/tech/stl/LessThanComparable">LessThan Comparable</a> requirements,
+ *          \p InputIterator2's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
+ *          the ordering on \p InputIterator2's \c value_type is a strict weak ordering, as defined in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements,
  *          and \p InputIterator2's \c value_type is convertable to a type in \p OutputIterator's set of \c value_types.
- *  \tparam InputIterator3 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator3 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator3's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam InputIterator4 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator4 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator4's \c value_type is convertible to a type in \p OutputIterator2's set of \c value_types.
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam StrictWeakCompare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam StrictWeakCompare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The ranges <tt>[keys_first1, keys_last1)</tt> and <tt>[keys_first2, keys_last2)</tt> shall be sorted with respect to \p comp.
  *  \pre The resulting ranges shall not overlap with any input range.
@@ -2956,8 +2954,6 @@ template<typename InputIterator1,
 /*! \} // end set_operations
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/set_operations.inl>
-
diff --git a/thrust/shuffle.h b/thrust/shuffle.h
new file mode 100644
index 000000000..d95327e29
--- /dev/null
+++ b/thrust/shuffle.h
@@ -0,0 +1,179 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Reorders range by a uniform random permutation
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup reordering
+*  \ingroup algorithms
+*
+*  \addtogroup shuffling
+*  \ingroup reordering
+*  \{
+*/
+
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation
+ *  using the \p thrust::host execution policy for parallelization:
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(thrust::host, A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename DerivedPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, URBG&& g);
+
+/*! \p shuffle reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle(A, A + N, g);
+ *  // A is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle_copy
+ */
+template <typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(RandomIterator first, RandomIterator last,
+                                 URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *  \p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  #include <thrust/execution_policy.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(thrust::host, A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename DerivedPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy>& exec,
+    RandomIterator first, RandomIterator last, OutputIterator result, URBG&& g);
+
+/*! shuffle_copy differs from shuffle only in that the reordered sequence is written to different output sequences, rather than in place.
+ *\p shuffle_copy reorders the elements <tt>[first, last)</tt> by a uniform pseudorandom permutation, defined by
+ *  random engine \p g.
+ *
+ *  \param first The beginning of the sequence to shuffle.
+ *  \param last The end of the sequence to shuffle.
+ *  \param result Destination of shuffled sequence
+ *  \param g A UniformRandomBitGenerator
+ *
+ *  \tparam RandomIterator is a random access iterator
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam URBG is a uniform random bit generator
+ *
+ *  The following code snippet demonstrates how to use \p shuffle_copy to create a random permutation.
+ *
+ *  \code
+ *  #include <thrust/shuffle.h>
+ *  #include <thrust/random.h>
+ *  int A[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ *  int result[10];
+ *  const int N = sizeof(A)/sizeof(int);
+ *  thrust::default_random_engine g;
+ *  thrust::shuffle_copy(A, A + N, result, g);
+ *  // result is now {6, 5, 8, 7, 2, 1, 4, 3, 10, 9}
+ *  \endcode
+ *
+ *  \see \p shuffle
+ */
+template <typename RandomIterator, typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(RandomIterator first, RandomIterator last,
+                                      OutputIterator result, URBG&& g);
+
+THRUST_NAMESPACE_END
+
+#include <thrust/detail/shuffle.inl>
+#endif
diff --git a/thrust/sort.h b/thrust/sort.h
index c4e90320c..5cf9d6217 100644
--- a/thrust/sort.h
+++ b/thrust/sort.h
@@ -15,7 +15,7 @@
  */
 
 
-/*! \file sort.h
+/*! \file thrust/sort.h
  *  \brief Functions for reorganizing ranges into sorted order
  */
 
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup sorting
  *  \ingroup algorithms
@@ -51,11 +49,11 @@ namespace thrust
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers using the \p thrust::host execution policy for parallelization:
@@ -70,7 +68,7 @@ namespace thrust
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -94,11 +92,11 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers.
@@ -112,7 +110,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -140,11 +138,11 @@ template<typename RandomAccessIterator>
  *  \param comp  Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
@@ -160,7 +158,7 @@ template<typename RandomAccessIterator>
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -189,11 +187,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param comp  Comparison operator.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator.
@@ -208,7 +206,7 @@ __host__ __device__
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort
  *  \see \p sort_by_key
  */
@@ -241,11 +239,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers using the \p thrust::host execution policy for parallelization:
@@ -260,7 +258,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -288,11 +286,11 @@ __host__ __device__
  *  \param first The beginning of the sequence.
  *  \param last The end of the sequence.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
- *          and \p RandomAccessIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *  The following code snippet demonstrates how to use \p sort to sort
  *  a sequence of integers.
@@ -306,7 +304,7 @@ __host__ __device__
  *  // A is now {1, 2, 4, 5, 7, 8}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -338,11 +336,11 @@ template<typename RandomAccessIterator>
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator using the \p thrust::host execution policy for parallelization:
@@ -358,7 +356,7 @@ template<typename RandomAccessIterator>
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -391,11 +389,11 @@ __host__ __device__
  *  \param last The end of the sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator is mutable,
  *          and \p RandomAccessIterator's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code demonstrates how to sort integers in descending order
  *  using the greater<int> comparison operator.
@@ -410,7 +408,7 @@ __host__ __device__
  *  // A is now {8, 7, 5, 4, 2, 1};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/stable_sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/stable_sort
  *  \see \p sort
  *  \see \p stable_sort_by_key
  */
@@ -450,12 +448,12 @@ template<typename RandomAccessIterator,
  *  \param values_first The beginning of the value sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -476,7 +474,7 @@ template<typename RandomAccessIterator,
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -510,12 +508,12 @@ __host__ __device__
  *  \param keys_last The end of the key sequence.
  *  \param values_first The beginning of the value sequence.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -534,7 +532,7 @@ __host__ __device__
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -571,13 +569,13 @@ template<typename RandomAccessIterator1,
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -597,7 +595,7 @@ template<typename RandomAccessIterator1,
  *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -635,13 +633,13 @@ __host__ __device__
  *  \param values_first The beginning of the value sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -660,7 +658,7 @@ __host__ __device__
  *  // values is now {'d', 'f', 'e', 'b', 'c', 'a'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p stable_sort_by_key
  *  \see \p sort
  */
@@ -699,12 +697,12 @@ template<typename RandomAccessIterator1,
  *  \param values_first The beginning of the value sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -725,7 +723,7 @@ template<typename RandomAccessIterator1,
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -761,12 +759,12 @@ __host__ __device__
  *  \param keys_last The end of the key sequence.
  *  \param values_first The beginning of the value sequence.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
- *          and \p RandomAccessIterator1's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *          and \p RandomAccessIterator1's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering relation on \p RandomAccessIterator1's \c value_type is a <em>strict weak ordering</em>, as defined in the
- *          <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *          <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
@@ -785,7 +783,7 @@ __host__ __device__
  *  // values is now {'a', 'c', 'b', 'e', 'f', 'd'}
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -824,13 +822,13 @@ template<typename RandomAccessIterator1,
  *  \param comp Comparison operator.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -851,7 +849,7 @@ template<typename RandomAccessIterator1,
  *  \endcode
  *
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -891,13 +889,13 @@ __host__ __device__
  *  \param values_first The beginning of the value sequence.
  *  \param comp Comparison operator.
  *
- *  \tparam RandomAccessIterator1 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.html">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/random_access_iterator">Random Access Iterator</a>,
  *          \p RandomAccessIterator1 is mutable,
  *          and \p RandomAccessIterator1's \c value_type is convertible to \p StrictWeakOrdering's
  *          \c first_argument_type and \c second_argument_type.
- *  \tparam RandomAccessIterator2 is a model of <a href="http://www.sgi.com/tech/stl/RandomAccessIterator.hml">Random Access Iterator</a>,
+ *  \tparam RandomAccessIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">Random Access Iterator</a>,
  *          and \p RandomAccessIterator2 is mutable.
- *  \tparam StrictWeakOrdering is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam StrictWeakOrdering is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last))</tt> shall not overlap the range <tt>[values_first, values_first + (keys_last - keys_first))</tt>.
  *
@@ -917,7 +915,7 @@ __host__ __device__
  *  \endcode
  *
  *
- *  \see http://www.sgi.com/tech/stl/sort.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/sort
  *  \see \p sort_by_key
  *  \see \p stable_sort
  */
@@ -956,10 +954,10 @@ template<typename RandomAccessIterator1,
  *  \return \c true, if the sequence is sorted; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
- *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *
  *  The following code demonstrates how to use \p is_sorted to test whether the
@@ -990,7 +988,7 @@ template<typename RandomAccessIterator1,
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see is_sorted_until
  *  \see \c sort
  *  \see \c stable_sort
@@ -1014,10 +1012,10 @@ __host__ __device__
  *  \param last  The end of the sequence.
  *  \return \c true, if the sequence is sorted; \c false, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>,
  *          and the ordering on objects of \p ForwardIterator's \c value_type is a <em>strict weak ordering</em>, as defined
- *          in the <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a> requirements.
+ *          in the <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a> requirements.
  *
  *
  *  The following code demonstrates how to use \p is_sorted to test whether the
@@ -1046,7 +1044,7 @@ __host__ __device__
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see is_sorted_until
  *  \see \c sort
  *  \see \c stable_sort
@@ -1072,10 +1070,10 @@ template<typename ForwardIterator>
  *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
  *          and \c second_argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted to test whether the
  *  contents of a \c device_vector are stored in descending order using the \p thrust::device execution
@@ -1106,7 +1104,7 @@ template<typename ForwardIterator>
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see \c sort
  *  \see \c stable_sort
  *  \see \c less<T>
@@ -1130,10 +1128,10 @@ __host__ __device__
  *  \param comp  Comparison operator.
  *  \return \c true, if the sequence is sorted according to comp; \c false, otherwise.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator's \c value_type is convertible to both \c StrictWeakOrdering's \c first_argument_type
  *          and \c second_argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted to test whether the
  *  contents of a \c device_vector are stored in descending order.
@@ -1162,7 +1160,7 @@ __host__ __device__
  *  // result == true
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/is_sorted.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/is_sorted
  *  \see \c sort
  *  \see \c stable_sort
  *  \see \c less<T>
@@ -1185,8 +1183,8 @@ template<typename ForwardIterator, typename Compare>
  *  \return The last iterator in the input range for which it is sorted.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted using the \p thrust::host execution policy for
@@ -1227,8 +1225,8 @@ __host__ __device__
  *  \param last The end of the range of interest.
  *  \return The last iterator in the input range for which it is sorted.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
- *          \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/LessThanComparable.html">LessThan Comparable</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
+ *          \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/named_req/LessThanComparable">LessThan Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted:
@@ -1270,9 +1268,9 @@ template<typename ForwardIterator>
  *  \return The last iterator in the input range for which it is sorted.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
  *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted in descending order using the \p thrust::host execution
@@ -1317,9 +1315,9 @@ __host__ __device__
  *  \param comp The function object to use for comparison.
  *  \return The last iterator in the input range for which it is sorted.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a> and
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a> and
  *          \p ForwardIterator's \c value_type is convertible to \p Compare's \c argument_type.
- *  \tparam Compare is a model of <a href="http://www.sgi.com/tech/stl/StrictWeakOrdering.html">Strict Weak Ordering</a>.
+ *  \tparam Compare is a model of <a href="https://en.cppreference.com/w/cpp/concepts/strict_weak_order">Strict Weak Ordering</a>.
  *
  *  The following code snippet demonstrates how to use \p is_sorted_until to find the first position
  *  in an array where the data becomes unsorted in descending order:
@@ -1355,8 +1353,6 @@ template<typename ForwardIterator, typename Compare>
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/sort.inl>
-
diff --git a/thrust/swap.h b/thrust/swap.h
index 246e84387..d8a8be73c 100644
--- a/thrust/swap.h
+++ b/thrust/swap.h
@@ -23,12 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-// empty Doxygen comment below so namespace thrust's documentation will be extracted
-
-/*!
- */
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -47,7 +42,7 @@ namespace thrust
  *  \param b The second value of interest. After completion,
  *           the value of a will be returned here.
  *
- *  \tparam Assignable is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>.
+ *  \tparam Assignable is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>.
  *
  *  The following code snippet demonstrates how to use \p swap to
  *  swap the contents of two variables.
@@ -94,9 +89,9 @@ inline void swap(Assignable1 &a, Assignable2 &b);
  *          sequence to swap.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
  *
  *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
@@ -121,7 +116,7 @@ inline void swap(Assignable1 &a, Assignable2 &b);
  *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/swap_ranges
  *  \see \c swap
  */
 template<typename DerivedPolicy,
@@ -146,9 +141,9 @@ __host__ __device__
  *  \return An iterator pointing to one position past the last element of the second
  *          sequence to swap.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1's \c value_type must be convertible to \p ForwardIterator2's \c value_type.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2's \c value_type must be convertible to \p ForwardIterator1's \c value_type.
  *
  *  \pre \p first1 may equal \p first2, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[first2, first2 + (last1 - first1))</tt> otherwise.
@@ -171,7 +166,7 @@ __host__ __device__
  *  // v1[0] == 3, v1[1] == 4, v2[0] == 1, v2[1] == 2
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/swap_ranges.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/swap_ranges
  *  \see \c swap
  */
 template<typename ForwardIterator1,
@@ -184,8 +179,6 @@ template<typename ForwardIterator1,
 /*! \} // copying
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/swap.inl>
-
diff --git a/thrust/system/cpp/detail/execution_policy.h b/thrust/system/cpp/detail/execution_policy.h
index ea884250c..1a8193bf3 100644
--- a/thrust/system/cpp/detail/execution_policy.h
+++ b/thrust/system/cpp/detail/execution_policy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -56,11 +55,8 @@ template<typename Derived>
   struct execution_policy
     : thrust::system::detail::sequential::execution_policy<Derived>
 {
-  // allow conversion to tag
-  inline operator tag () const
-  {
-    return tag();
-  }
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
 };
 
 } // end detail
@@ -80,5 +76,5 @@ using thrust::system::cpp::execution_policy;
 using thrust::system::cpp::tag;
 
 } // end cpp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/memory.inl b/thrust/system/cpp/detail/memory.inl
index dd779f14b..650aa1cb5 100644
--- a/thrust/system/cpp/detail/memory.inl
+++ b/thrust/system/cpp/detail/memory.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,59 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/memory.h>
 #include <thrust/system/cpp/detail/malloc_and_free.h>
 #include <limits>
 
-namespace thrust
-{
-
-// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-namespace detail
-{
-
-template<typename T>
-  struct pointer_raw_pointer< thrust::cpp::pointer<T> >
-{
-  typedef typename thrust::cpp::pointer<T>::raw_pointer type;
-}; // end pointer_raw_pointer
-
-} // end detail
-#endif
-
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
 {
 
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
 pointer<void> malloc(std::size_t n)
 {
   tag t;
@@ -88,5 +48,5 @@ void free(pointer<void> ptr)
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/par.h b/thrust/system/cpp/detail/par.h
index ebee4ad40..c56921327 100644
--- a/thrust/system/cpp/detail/par.h
+++ b/thrust/system/cpp/detail/par.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,11 +17,10 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
-#include <thrust/detail/execute_with_allocator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -30,23 +29,19 @@ namespace detail
 {
 
 
-struct par_t : thrust::system::cpp::detail::execution_policy<par_t>
+struct par_t : thrust::system::cpp::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::cpp::detail::execution_policy>
 {
-  par_t() : thrust::system::cpp::detail::execution_policy<par_t>() {}
-
-  template<typename Allocator>
-    thrust::detail::execute_with_allocator<Allocator, thrust::system::cpp::detail::execution_policy>
-      operator()(Allocator &alloc) const
-  {
-    return thrust::detail::execute_with_allocator<Allocator, thrust::system::cpp::detail::execution_policy>(alloc);
-  }
+  __host__ __device__
+  constexpr par_t() : thrust::system::cpp::detail::execution_policy<par_t>() {}
 };
 
 
 } // end detail
 
 
-static const detail::par_t par;
+THRUST_INLINE_CONSTANT detail::par_t par;
 
 
 } // end cpp
@@ -62,5 +57,5 @@ using thrust::system::cpp::par;
 
 
 } // end cpp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/detail/per_device_resource.h b/thrust/system/cpp/detail/per_device_resource.h
new file mode 100644
index 000000000..1b8d61f92
--- /dev/null
+++ b/thrust/system/cpp/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/system/cpp/detail/vector.inl b/thrust/system/cpp/detail/vector.inl
index 4f6dfa044..02980c62a 100644
--- a/thrust/system/cpp/detail/vector.inl
+++ b/thrust/system/cpp/detail/vector.inl
@@ -18,9 +18,9 @@
 
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/vector.h>
+#include <utility>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -50,6 +50,14 @@ template<typename T, typename Allocator>
       : super_t(x)
 {}
 
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(vector &&x)
+        : super_t(std::move(x))
+  {}
+#endif
+
 template<typename T, typename Allocator>
   template<typename OtherT, typename OtherAllocator>
     vector<T,Allocator>
@@ -71,6 +79,47 @@ template<typename T, typename Allocator>
         : super_t(first,last)
 {}
 
+template<typename T, typename Allocator>
+  vector<T,Allocator> &
+    vector<T,Allocator>
+      ::operator=(const vector &x)
+{
+  super_t::operator=(x);
+  return *this;
+}
+
+#if THRUST_CPP_DIALECT >= 2011
+  template<typename T, typename Allocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(vector &&x)
+  {
+    super_t::operator=(std::move(x));
+    return *this;
+  }
+#endif
+  
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(std::initializer_list<T> il)
+        : super_t(il)
+  {}
+  
+  template<typename T, typename Allocator>
+    vector<T,Allocator>
+      ::vector(std::initializer_list<T> il, const Allocator& alloc)
+        : super_t(il, alloc)
+  {}
+
+  template<typename T, typename Allocator>
+    vector<T,Allocator> &
+      vector<T,Allocator>
+        ::operator=(std::initializer_list<T> il)
+  {
+    super_t::operator=(il);
+    return *this;
+  }
+
 template<typename T, typename Allocator>
   template<typename OtherT, typename OtherAllocator>
     vector<T,Allocator> &
@@ -93,5 +142,5 @@ template<typename T, typename Allocator>
       
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cpp/execution_policy.h b/thrust/system/cpp/execution_policy.h
index 203ba0ae7..0d8a9a367 100644
--- a/thrust/system/cpp/execution_policy.h
+++ b/thrust/system/cpp/execution_policy.h
@@ -14,12 +14,12 @@
  *  limitations under the License.
  */
 
-#pragma once
-
 /*! \file thrust/system/cpp/execution_policy.h
- *  \brief Execution policies for Thrust's standard C++ system.
+ *  \brief Execution policies for Thrust's Standard C++ system.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 
 // get the execution policies definitions first
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cpp
@@ -104,7 +103,7 @@ struct execution_policy : thrust::execution_policy<DerivedPolicy>
 struct tag : thrust::system::cpp::execution_policy<tag> { unspecified };
 
 
-/*! 
+/*!
  *  \p thrust::system::cpp::par is the parallel execution policy associated with Thrust's standard
  *  C++ backend system.
  *
@@ -130,7 +129,7 @@ struct tag : thrust::system::cpp::execution_policy<tag> { unspecified };
  *    __host__ __device__
  *    void operator()(int x)
  *    {
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -151,7 +150,7 @@ static const unspecified par;
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/cpp/memory.h b/thrust/system/cpp/memory.h
index 253e550bc..a18abeb8e 100644
--- a/thrust/system/cpp/memory.h
+++ b/thrust/system/cpp/memory.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,294 +15,21 @@
  */
 
 /*! \file thrust/system/cpp/memory.h
- *  \brief Managing memory associated with Thrust's standard C++ system.
+ *  \brief Managing memory associated with Thrust's Standard C++ system.
  */
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cpp/execution_policy.h>
+#include <thrust/system/cpp/memory_resource.h>
 #include <thrust/memory.h>
 #include <thrust/detail/type_traits.h>
-#include <thrust/detail/allocator/malloc_allocator.h>
+#include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
-namespace system
-{
-namespace cpp
-{
-
-template<typename> class pointer;
-
-} // end cpp
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::cpp::pointer<Element> >
-{
-  private:
-    typedef thrust::system::cpp::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
-{
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::cpp
- *  \brief \p thrust::system::cpp is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's standard C++ backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::cpp</tt>
- *         namespace for easy access.
- *
- */
-namespace cpp
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::cpp::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the cpp system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in cpp memory.
- *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
- *
- *  \p pointer can be created with the function \p cpp::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
- *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
- *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
- *
- *  \tparam T specifies the type of the pointee.
- *
- *  \see cpp::malloc
- *  \see cpp::free
- *  \see raw_pointer_cast
- */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::cpp::tag,
-               thrust::system::cpp::reference<T>,
-               thrust::system::cpp::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::cpp::tag,
-      //thrust::system::cpp::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::cpp::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that cpp::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p cpp system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cpp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p cpp system.
- *  \p reference is the type of the result of dereferencing a \p cpp::pointer.
- *
- *  \tparam T Specifies the type of the referenced object.
- */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::cpp::pointer<T>,
-               thrust::system::cpp::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::cpp::pointer<T>,
-      thrust::system::cpp::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference ot interest.
- */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
 
 /*! Allocates an area of memory available to Thrust's <tt>cpp</tt> system.
  *  \param n Number of bytes to allocate.
@@ -337,78 +64,37 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T> using allocator = thrust::detail::malloc_allocator<T,tag,pointer<T> >;
-
-/*! \p cpp::allocator is the default allocator used by the \p cpp system's containers such as
- *  <tt>cpp::vector</tt> if no user-specified allocator is provided. \p cpp::allocator allocates
- *  (deallocates) storage with \p cpp::malloc (\p cpp::free).
+/*! \p cpp::allocator is the default allocator used by the \p cpp system's
+ *  containers such as <tt>cpp::vector</tt> if no user-specified allocator is
+ *  provided. \p cpp::allocator allocates (deallocates) storage with \p
+ *  cpp::malloc (\p cpp::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::detail::malloc_allocator<
-        T,
-        tag,
-        pointer<T>
-      >
-{
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator &) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> &) {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cpp::memory_resource
+>;
 
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end cpp
-
-/*! \}
+/*! \p cpp::universal_allocator allocates memory that can be used by the \p cpp
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cpp::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::cpp
 
 /*! \namespace thrust::cpp
  *  \brief \p thrust::cpp is a top-level alias for thrust::system::cpp.
  */
 namespace cpp
 {
-
-using thrust::system::cpp::pointer;
-using thrust::system::cpp::reference;
 using thrust::system::cpp::malloc;
 using thrust::system::cpp::free;
 using thrust::system::cpp::allocator;
+} // namespace cpp
 
-} // end cpp
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cpp/detail/memory.inl>
 
diff --git a/thrust/system/cpp/memory_resource.h b/thrust/system/cpp/memory_resource.h
new file mode 100644
index 000000000..04b4e3cf8
--- /dev/null
+++ b/thrust/system/cpp/memory_resource.h
@@ -0,0 +1,71 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cpp/memory_resource.h
+ *  \brief Memory resources for the Standard C++ system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/cpp/pointer.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::cpp::pointer<void>
+    > native_resource;
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::cpp::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! The memory resource for the Standard C++ system. Uses \p
+ *  mr::new_delete_resource and tags it with \p cpp::pointer.
+ */
+typedef detail::native_resource memory_resource;
+/*! The unified memory resource for the Standard C++ system. Uses
+ *  \p mr::new_delete_resource and tags it with \p cpp::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p cpp::universal_memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \} // memory_resources
+ */
+
+
+}} // namespace system::cpp
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cpp/pointer.h b/thrust/system/cpp/pointer.h
new file mode 100644
index 000000000..f204fa375
--- /dev/null
+++ b/thrust/system/cpp/pointer.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cpp/memory.h
+ *  \brief Managing memory associated with Thrust's TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <type_traits>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
+{
+
+/*! \p cpp::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p cpp system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p cpp memory.
+ *
+ *  \p cpp::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p cpp::pointer can be created with the function \p cpp::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cpp::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
+ *
+ *  \note \p cpp::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p cpp::pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cpp::malloc
+ *  \see cpp::free
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::cpp::tag,
+  thrust::tagged_reference<T, thrust::system::cpp::tag>
+>;
+
+/*! \p cpp::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p cpp system and host systems.
+ *
+ *  \p cpp::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p cpp::universal_pointer can be created with \p cpp::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cpp::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p cpp::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p cpp::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cpp::universal_allocator
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::cpp::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p cpp system. \p reference is the type of the result of
+ *  dereferencing a \p cpp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template <typename T>
+using reference = thrust::reference<T, thrust::system::cpp::tag>;
+
+}} // namespace system::cpp
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::cpp
+ *  \brief \p thrust::cpp is a top-level alias for \p thrust::system::cpp. */
+namespace cpp
+{
+using thrust::system::cpp::pointer;
+using thrust::system::cpp::universal_pointer;
+using thrust::system::cpp::reference;
+} // namespace cpp
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cpp/vector.h b/thrust/system/cpp/vector.h
index 357bbd07f..2a418dbc3 100644
--- a/thrust/system/cpp/vector.h
+++ b/thrust/system/cpp/vector.h
@@ -26,124 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace cpp
 {
 
-// forward declaration of host_vector
-template<typename T, typename Allocator> class host_vector;
-
-namespace system
-{
-namespace cpp
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
 /*! \p cpp::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p cpp::vector may vary dynamically; memory management is
  *  automatic. The elements contained in a \p cpp::vector reside in memory
- *  available to the \p cpp system.
+ *  accessible by the \p cpp system.
  *
  *  \tparam T The element type of the \p cpp::vector.
- *  \tparam Allocator The allocator type of the \p cpp::vector. Defaults to \p cpp::allocator.
+ *  \tparam Allocator The allocator type of the \p cpp::vector.
+ *          Defaults to \p cpp::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p cpp::vector
+ *                   shared by \p cpp::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p cpp::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p cpp::vector with \p n default-constructed elements.
-     *  \param n The size of the \p cpp::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p cpp::vector with \p n copies of \p value.
-     *  \param n The size of the \p cpp::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p cpp::vector.
-     *  \param x The other \p cpp::vector to copy.
-     */
-    vector(const vector &x);
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p cpp::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::cpp::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p cpp::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p cpp::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p cpp::universal_vector reside in memory accessible by the \p cpp system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p cpp::universal_vector.
+ *  \tparam Allocator The allocator type of the \p cpp::universal_vector.
+ *          Defaults to \p cpp::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cpp::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::cpp::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end cpp
-} // end system
+}} // namespace system::cpp
 
-// alias system::cpp names at top-level
 namespace cpp
 {
-
 using thrust::system::cpp::vector;
+using thrust::system::cpp::universal_vector;
+}
 
-} // end cpp
-
-} // end thrust
-
-#include <thrust/system/cpp/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/config.h b/thrust/system/cuda/config.h
new file mode 100644
index 000000000..f6c8b9cb3
--- /dev/null
+++ b/thrust/system/cuda/config.h
@@ -0,0 +1,146 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+
+#ifdef THRUST_DEBUG_SYNC
+#define THRUST_DEBUG_SYNC_FLAG true
+#define CUB_DEBUG_SYNC
+#else
+#define THRUST_DEBUG_SYNC_FLAG false
+#endif
+
+
+#include <thrust/detail/config.h>
+
+// We don't directly include <cub/version.cuh> since it doesn't exist in
+// older releases. This header will always pull in version info:
+#include <cub/util_namespace.cuh>
+#include <cub/util_debug.cuh>
+
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+/**
+ * \def THRUST_RUNTIME_FUNCTION
+ *
+ * Execution space for functions that can use the CUDA runtime API (`__host__`
+ * when RDC is off, `__host__ __device__` when RDC is on).
+ */
+#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION
+
+/**
+ * \def THRUST_RDC_ENABLED
+ *
+ * Defined if RDC is enabled.
+ */
+#ifdef CUB_RDC_ENABLED
+#define THRUST_RDC_ENABLED
+#endif
+
+/**
+ * \def __THRUST_HAS_CUDART__
+ *
+ * Whether or not the active compiler pass is allowed to invoke device kernels
+ * or methods from the CUDA runtime API.
+ *
+ * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__`
+ * and is not compatible with `NV_IF_TARGET`. It is provided for legacy
+ * purposes only.
+ *
+ * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`.
+ */
+#ifdef CUB_RUNTIME_ENABLED
+#define __THRUST_HAS_CUDART__ 1
+#else
+#define __THRUST_HAS_CUDART__ 0
+#endif
+
+// These definitions were intended for internal use only and are now obsolete.
+// If you relied on them, consider porting your code to use the functionality
+// in libcu++'s <nv/target> header.
+//
+// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make
+// them available again. These should be considered deprecated and will be
+// fully removed in a future version.
+#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS
+#ifdef __CUDA_ARCH__
+#define THRUST_DEVICE_CODE
+#endif // __CUDA_ARCH__
+#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS
+
+#ifdef THRUST_AGENT_ENTRY_NOINLINE
+#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__
+#else
+#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__
+#endif
+
+#define THRUST_DEVICE_FUNCTION __device__ __forceinline__
+#define THRUST_HOST_FUNCTION __host__     __forceinline__
+#define THRUST_FUNCTION __host__ __device__ __forceinline__
+#if 0
+#define THRUST_ARGS(...) __VA_ARGS__
+#define THRUST_STRIP_PARENS(X) X
+#define THRUST_AGENT_ENTRY(ARGS) THRUST_FUNCTION static void entry(THRUST_STRIP_PARENS(THRUST_ARGS ARGS))
+#else
+#define THRUST_AGENT_ENTRY(...) THRUST_AGENT_ENTRY_INLINE_ATTR __device__ static void entry(__VA_ARGS__)
+#endif
+
+#ifndef THRUST_IGNORE_CUB_VERSION_CHECK
+
+#include <thrust/version.h>
+#if THRUST_VERSION != CUB_VERSION
+#error The version of CUB in your include path is not compatible with this release of Thrust. CUB is now included in the CUDA Toolkit, so you no longer need to use your own checkout of CUB. Define THRUST_IGNORE_CUB_VERSION_CHECK to ignore this.
+#endif
+
+// Make sure the CUB namespace has been declared using the modern macros:
+CUB_NAMESPACE_BEGIN
+CUB_NAMESPACE_END
+
+#else // THRUST_IGNORE_CUB_VERSION_CHECK
+
+// Make sure the CUB namespace has been declared. Use the old macros for compat
+// with older CUB:
+CUB_NS_PREFIX
+namespace cub {}
+CUB_NS_POSTFIX
+
+// Older versions of CUB do not define this. Set it to a reasonable default if
+// not provided.
+#ifndef CUB_NS_QUALIFIER
+#define CUB_NS_QUALIFIER ::cub
+#endif
+
+#endif // THRUST_IGNORE_CUB_VERSION_CHECK
+
+// Pull the fully qualified cub:: namespace into the thrust:: namespace so we
+// don't have to use CUB_NS_QUALIFIER as long as we're in thrust::.
+THRUST_NAMESPACE_BEGIN
+namespace cub
+{
+using namespace CUB_NS_QUALIFIER;
+}
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/adjacent_difference.h b/thrust/system/cuda/detail/adjacent_difference.h
index 1d6dba560..284611235 100644
--- a/thrust/system/cuda/detail/adjacent_difference.h
+++ b/thrust/system/cuda/detail/adjacent_difference.h
@@ -1,54 +1,294 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
+#pragma once
 
+#include <thrust/detail/config.h>
 
-/*! \file adjacent_difference.h
- *  \brief CUDA implementation of adjacent_difference.
- */
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#pragma once
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/functional.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
+#include <cub/device/device_adjacent_difference.cuh>
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+THRUST_NAMESPACE_BEGIN
+
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename BinaryFunction>
+__host__ __device__ OutputIterator
+adjacent_difference(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator                                               first,
+    InputIterator                                               last,
+    OutputIterator                                              result,
+    BinaryFunction                                              binary_op);
+
+namespace cuda_cub {
+
+namespace __adjacent_difference {
+
+  template <bool MayAlias,
+            class InputIt,
+            class OutputIt,
+            class BinaryOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
+            cudaStream_t stream)
+  {
+    if (num_items == 0)
+    {
+      return cudaSuccess;
+    }
+
+    constexpr bool may_alias = MayAlias;
+    constexpr bool read_left = true;
+
+    using Dispatch32 = cub::DispatchAdjacentDifference<InputIt,
+                                                       OutputIt,
+                                                       BinaryOp,
+                                                       thrust::detail::int32_t,
+                                                       may_alias,
+                                                       read_left>;
+    using Dispatch64 = cub::DispatchAdjacentDifference<InputIt,
+                                                       OutputIt,
+                                                       BinaryOp,
+                                                       thrust::detail::int64_t,
+                                                       may_alias,
+                                                       read_left>;
+
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (d_temp_storage,
+                                 temp_storage_bytes,
+                                 first,
+                                 result,
+                                 num_items_fixed,
+                                 binary_op,
+                                 stream));
+    return status;
+  }
+
+  template <class InputIt,
+            class OutputIt,
+            class BinaryOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, false> /* comparable */)
+  {
+    constexpr bool may_alias = true;
+    return doit_step<may_alias>(d_temp_storage,
+                                temp_storage_bytes,
+                                first,
+                                result,
+                                binary_op,
+                                num_items,
+                                stream);
+  }
+
+  template <class InputIt,
+            class OutputIt,
+            class BinaryOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            InputIt first,
+            OutputIt result,
+            BinaryOp binary_op,
+            std::size_t num_items,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, true> /* comparable */)
+  {
+    // The documentation states that pointers might be equal but can't alias in
+    // any other way. That is, the distance should be equal to zero or exceed
+    // `num_items`. In the latter case, we use an optimized version.
+    if (first != result)
+    {
+      constexpr bool may_alias = false;
+      return doit_step<may_alias>(d_temp_storage,
+                                  temp_storage_bytes,
+                                  first,
+                                  result,
+                                  binary_op,
+                                  num_items,
+                                  stream);
+    }
 
+    constexpr bool may_alias = true;
+    return doit_step<may_alias>(d_temp_storage,
+                                temp_storage_bytes,
+                                first,
+                                result,
+                                binary_op,
+                                num_items,
+                                stream);
+  }
 
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction>
-__host__ __device__
-OutputIterator adjacent_difference(execution_policy<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last,
-                                   OutputIterator result,
-                                   BinaryFunction binary_op);
+  template <typename Derived,
+            typename InputIt,
+            typename OutputIt,
+            typename BinaryOp>
+  OutputIt THRUST_RUNTIME_FUNCTION
+  adjacent_difference(execution_policy<Derived>& policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      OutputIt                   result,
+                      BinaryOp                   binary_op)
+  {
+    const auto num_items =
+      static_cast<std::size_t>(thrust::distance(first, last));
+    std::size_t storage_size = 0;
+    cudaStream_t stream = cuda_cub::stream(policy);
+
+    using UnwrapInputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t<InputIt>;
+    using UnwrapOutputIt = thrust::detail::try_unwrap_contiguous_iterator_return_t<OutputIt>;
+
+    using InputValueT = thrust::iterator_value_t<UnwrapInputIt>;
+    using OutputValueT = thrust::iterator_value_t<UnwrapOutputIt>;
+
+    constexpr bool can_compare_iterators =
+      std::is_pointer<UnwrapInputIt>::value &&
+      std::is_pointer<UnwrapOutputIt>::value &&
+      std::is_same<InputValueT, OutputValueT>::value;
+
+    auto first_unwrap = thrust::detail::try_unwrap_contiguous_iterator(first);
+    auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+    thrust::detail::integral_constant<bool, can_compare_iterators> comparable;
+
+    cudaError_t status = doit_step(nullptr,
+                                   storage_size,
+                                   first_unwrap,
+                                   result_unwrap,
+                                   binary_op,
+                                   num_items,
+                                   stream,
+                                   comparable);
+    cuda_cub::throw_on_error(status, "adjacent_difference failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+
+    status = doit_step(static_cast<void *>(tmp.data().get()),
+                       storage_size,
+                       first_unwrap,
+                       result_unwrap,
+                       binary_op,
+                       num_items,
+                       stream,
+                       comparable);
+    cuda_cub::throw_on_error(status, "adjacent_difference failed on 2nd step");
+
+    status = cuda_cub::synchronize_optional(policy);
+    cuda_cub::throw_on_error(status, "adjacent_difference failed to synchronize");
+
+    return result + num_items;
+  }
+
+}    // namespace __adjacent_difference
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class BinaryOp>
+OutputIt __host__ __device__
+adjacent_difference(execution_policy<Derived> &policy,
+                    InputIt                    first,
+                    InputIt                    last,
+                    OutputIt                   result,
+                    BinaryOp                   binary_op)
+{
+  THRUST_CDP_DISPATCH(
+    (result = __adjacent_difference::adjacent_difference(policy,
+                                                         first,
+                                                         last,
+                                                         result,
+                                                         binary_op);),
+    (result = thrust::adjacent_difference(cvt_to_seq(derived_cast(policy)),
+                                          first,
+                                          last,
+                                          result,
+                                          binary_op);));
+  return result;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+adjacent_difference(execution_policy<Derived> &policy,
+                    InputIt                    first,
+                    InputIt                    last,
+                    OutputIt                   result)
+{
+  typedef typename iterator_traits<InputIt>::value_type input_type;
+  return cuda_cub::adjacent_difference(policy,
+                                       first,
+                                       last,
+                                       result,
+                                       minus<input_type>());
+}
 
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
 
-#include <thrust/system/cuda/detail/adjacent_difference.inl>
+//
+#include <thrust/memory.h>
+#include <thrust/adjacent_difference.h>
+#endif
 
diff --git a/thrust/system/cuda/detail/adjacent_difference.inl b/thrust/system/cuda/detail/adjacent_difference.inl
deleted file mode 100644
index f18a3d80f..000000000
--- a/thrust/system/cuda/detail/adjacent_difference.inl
+++ /dev/null
@@ -1,258 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-
-#include <thrust/adjacent_difference.h>
-#include <thrust/gather.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/detail/internal/decompose.h>
-#include <thrust/system/cuda/detail/default_decomposition.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/detail/launch_calculator.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/detail/seq.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace adjacent_difference_detail
-{
-
-
-template<typename Decomposition>
-struct last_index_in_each_interval : public thrust::unary_function<typename Decomposition::index_type, typename Decomposition::index_type>
-{
-  typedef typename Decomposition::index_type index_type;
-
-  Decomposition decomp;
-
-  __host__ __device__
-  last_index_in_each_interval(Decomposition decomp) : decomp(decomp) {}
-
-  __host__ __device__
-  index_type operator()(index_type interval)
-  {
-    return decomp[interval].end() - 1;
-  }
-};
-
-
-template <typename InputIterator1,
-          typename InputIterator2,
-          typename OutputIterator,
-          typename BinaryFunction,
-          typename Decomposition,
-          typename Context>
-struct adjacent_difference_closure
-{
-  InputIterator1 input;
-  InputIterator2 input_copy;
-  OutputIterator output;
-  BinaryFunction binary_op;
-  Decomposition  decomp;
-  Context        context;
-
-  typedef Context context_type;
-  
-  __host__ __device__
-  adjacent_difference_closure(InputIterator1 input,
-                              InputIterator2 input_copy,
-                              OutputIterator output,
-                              BinaryFunction binary_op,
-                              Decomposition  decomp,
-                              Context        context = Context())
-    : input(input), input_copy(input_copy), output(output), binary_op(binary_op), decomp(decomp), context(context) {}
-
-  __device__ __thrust_forceinline__
-  void operator()(void)
-  {
-    typedef typename thrust::iterator_value<InputIterator1>::type  InputType;
-    typedef typename Decomposition::index_type index_type;
-
-    // this block processes results in [range.begin(), range.end())
-    thrust::system::detail::internal::index_range<index_type> range = decomp[context.block_index()];
-    
-    input_copy += context.block_index() - 1;
-      
-    // prime the temp values for all threads so we don't need to launch a default constructor
-    InputType next_left = (context.block_index() == 0) ? thrust::raw_reference_cast(*input) : thrust::raw_reference_cast(*input_copy);
-
-    index_type base = range.begin();
-    index_type i    = range.begin() + context.thread_index();
-    
-    if(i < range.end())
-    {
-      if(context.thread_index() > 0)
-      {
-        InputIterator1 temp = input + (i - 1);
-        next_left = *temp;
-      }              
-    }
-    
-    input  += i;
-    output += i;
-
-    while(base < range.end())
-    {
-      InputType curr_left = next_left;
-
-      if(i + context.block_dimension() < range.end())
-      {
-        InputIterator1 temp = input + (context.block_dimension() - 1);
-        next_left = *temp;
-      }
-
-      context.barrier();
-
-      if(i < range.end())
-      {
-        if(i == 0)
-        {
-          *output = *input;
-        }
-        else
-        {
-          InputType x = *input;
-          *output = binary_op(x, curr_left);
-        }
-      }
-
-      i      += context.block_dimension();
-      base   += context.block_dimension();
-      input  += context.block_dimension();
-      output += context.block_dimension();
-    }
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction>
-__host__ __device__
-OutputIterator adjacent_difference(execution_policy<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last,
-                                   OutputIterator result,
-                                   BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_value<InputIterator>::type                        InputType;
-  typedef typename thrust::iterator_difference<InputIterator>::type                   IndexType;
-  typedef          thrust::system::detail::internal::uniform_decomposition<IndexType> Decomposition;
-
-  IndexType n = last - first;
-
-  if(n == 0)
-  {
-    return result;
-  }
-
-  Decomposition decomp = default_decomposition(last - first);
-
-  // allocate temporary storage
-  thrust::detail::temporary_array<InputType,DerivedPolicy> temp(exec, decomp.size() - 1);
-
-  // gather last value in each interval
-  last_index_in_each_interval<Decomposition> unary_op(decomp);
-  thrust::gather(exec,
-                 thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0), unary_op),
-                 thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0), unary_op) + (decomp.size() - 1),
-                 first,
-                 temp.begin());
-
-  
-  typedef typename thrust::detail::temporary_array<InputType,DerivedPolicy>::iterator InputIterator2;
-  typedef detail::blocked_thread_array Context;
-  typedef adjacent_difference_closure<InputIterator,InputIterator2,OutputIterator,BinaryFunction,Decomposition,Context> Closure;
-
-  Closure closure(first, temp.begin(), result, binary_op, decomp); 
-
-  detail::launch_closure(exec, closure, decomp.size());
-  
-  return result + n;
-} // end adjacent_difference()
-
-
-} // end namespace adjacent_difference_detail
-
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction>
-__host__ __device__
-OutputIterator adjacent_difference(execution_policy<DerivedPolicy> &exec,
-                                   InputIterator first, InputIterator last,
-                                   OutputIterator result,
-                                   BinaryFunction binary_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static OutputIterator parallel_path(execution_policy<DerivedPolicy> &exec,
-                                        InputIterator first, InputIterator last,
-                                        OutputIterator result,
-                                        BinaryFunction binary_op)
-    {
-      return thrust::system::cuda::detail::adjacent_difference_detail::adjacent_difference(exec, first, last, result, binary_op);
-    }
-
-    __host__ __device__
-    static OutputIterator sequential_path(execution_policy<DerivedPolicy> &,
-                                          InputIterator first, InputIterator last,
-                                          OutputIterator result,
-                                          BinaryFunction binary_op)
-    {
-      return thrust::adjacent_difference(thrust::seq, first, last, result, binary_op);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, last, result, binary_op);
-#else
-  return workaround::sequential_path(exec, first, last, result, binary_op);
-#endif
-}
-
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/assign_value.h b/thrust/system/cuda/detail/assign_value.h
index d026205db..8945f1cac 100644
--- a/thrust/system/cuda/detail/assign_value.h
+++ b/thrust/system/cuda/detail/assign_value.h
@@ -17,62 +17,18 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/detail/copy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#include <thrust/system/cuda/detail/copy.h>
 
+#include <nv/target>
 
-// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined
-//     symbols resulting from assign_value
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
 
-namespace
-{
-
-template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
-inline __host__ __device__
-  void assign_value_msvc2005_war(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
-{
-  // XXX war nvbugs/881631
-  struct war_nvbugs_881631
-  {
-    __host__ inline static void host_path(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
-    {
-      thrust::copy(exec, src, src + 1, dst);
-    }
-
-    __device__ inline static void device_path(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 dst, Pointer2 src)
-    {
-      *thrust::raw_pointer_cast(dst) = *thrust::raw_pointer_cast(src);
-    }
-  };
-
-#ifndef __CUDA_ARCH__
-  war_nvbugs_881631::host_path(exec,dst,src);
-#else
-  war_nvbugs_881631::device_path(exec,dst,src);
-#endif // __CUDA_ARCH__
-} // end assign_value_msvc2005_war()
-
-} // end anon namespace
-
-template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
-inline __host__ __device__
-  void assign_value(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
-{
-  return assign_value_msvc2005_war(exec,dst,src);
-} // end assign_value()
-
-#else
 
 template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 inline __host__ __device__
@@ -83,7 +39,7 @@ inline __host__ __device__
   {
     __host__ inline static void host_path(thrust::cuda::execution_policy<DerivedPolicy> &exec, Pointer1 dst, Pointer2 src)
     {
-      thrust::copy(exec, src, src + 1, dst);
+      cuda_cub::copy(exec, src, src + 1, dst);
     }
 
     __device__ inline static void device_path(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 dst, Pointer2 src)
@@ -92,70 +48,15 @@ inline __host__ __device__
     }
   };
 
-#ifndef __CUDA_ARCH__
-  war_nvbugs_881631::host_path(exec,dst,src);
-#else
-  war_nvbugs_881631::device_path(exec,dst,src);
-#endif // __CUDA_ARCH__
-} // end assign_value()
-
-#endif // msvc 2005 WAR
-
-
-// XXX WAR an issue with msvc 2005 (cl v14.00) which creates multiply-defined
-//     symbols resulting from assign_value
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-
-namespace
-{
-
-
-template<typename System1, typename System2, typename Pointer1, typename Pointer2>
-inline __host__ __device__
-  void assign_value_msvc2005_war(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
-{
-  // XXX war nvbugs/881631
-  struct war_nvbugs_881631
-  {
-    __host__ inline static void host_path(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
-    {
-      // rotate the systems so that they are ordered the same as (src, dst)
-      // for the call to thrust::copy
-      cross_system<System2,System1> rotated_systems = systems.rotate();
-      thrust::copy(rotated_systems, src, src + 1, dst);
-    }
-
-    __device__ inline static void device_path(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
-    {
-      // XXX forward the true cuda::execution_policy inside systems here
-      //     instead of materializing a tag
-      thrust::cuda::tag cuda_tag;
-      thrust::system::cuda::detail::assign_value(cuda_tag, dst, src);
-    }
-  };
-
-#if __CUDA_ARCH__
-  war_nvbugs_881631::device_path(systems,dst,src);
-#else
-  war_nvbugs_881631::host_path(systems,dst,src);
-#endif
-} // end assign_value_msvc2005_war
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(exec,dst,src);
+  ), (
+    war_nvbugs_881631::device_path(exec,dst,src);
+  ));
 
-
-} // end anon namespace
-
-
-template<typename System1, typename System2, typename Pointer1, typename Pointer2>
-inline __host__ __device__
-  void assign_value(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
-{
-  return assign_value_msvc2005_war(systems,dst,src);
 } // end assign_value()
 
 
-#else
-
-
 template<typename System1, typename System2, typename Pointer1, typename Pointer2>
 inline __host__ __device__
   void assign_value(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
@@ -168,31 +69,26 @@ inline __host__ __device__
       // rotate the systems so that they are ordered the same as (src, dst)
       // for the call to thrust::copy
       cross_system<System2,System1> rotated_systems = systems.rotate();
-      thrust::copy(rotated_systems, src, src + 1, dst);
+      cuda_cub::copy(rotated_systems, src, src + 1, dst);
     }
 
-    __device__ inline static void device_path(cross_system<System1,System2> &systems, Pointer1 dst, Pointer2 src)
+    __device__ inline static void device_path(cross_system<System1,System2> &, Pointer1 dst, Pointer2 src)
     {
       // XXX forward the true cuda::execution_policy inside systems here
       //     instead of materializing a tag
       thrust::cuda::tag cuda_tag;
-      thrust::system::cuda::detail::assign_value(cuda_tag, dst, src);
+      thrust::cuda_cub::assign_value(cuda_tag, dst, src);
     }
   };
 
-#if __CUDA_ARCH__
-  war_nvbugs_881631::device_path(systems,dst,src);
-#else
-  war_nvbugs_881631::host_path(systems,dst,src);
-#endif
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(systems,dst,src);
+  ), (
+    war_nvbugs_881631::device_path(systems,dst,src);
+  ));
 } // end assign_value()
 
 
-#endif // msvc 2005 WAR
-
-  
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
+} // end cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/async/copy.h b/thrust/system/cuda/detail/async/copy.h
new file mode 100644
index 000000000..6f2970759
--- /dev/null
+++ b/thrust/system/cuda/detail/async/copy.h
@@ -0,0 +1,537 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/async/transform.h>
+#include <thrust/system/cuda/detail/cross_system.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/uninitialized_copy.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+// ContiguousIterator input and output iterators
+// TriviallyCopyable elements
+// Host to device, device to host, device to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    is_indirectly_trivially_relocatable_to<ForwardIt, OutputIt>::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(
+    select_device_system(from_exec, to_exec)
+  );
+
+  using pointer
+    = typename thrust::detail::allocator_traits<decltype(device_alloc)>::
+      template rebind_traits<void>::pointer;
+
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(
+    select_device_system(from_exec, to_exec)
+  );
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(from_exec))
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(to_exec))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        extract_dependencies(
+          std::move(thrust::detail::derived_cast(from_exec))
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(to_exec))
+        )
+      )
+    );
+  }
+
+  // Run copy.
+
+  thrust::cuda_cub::throw_on_error(
+    cudaMemcpyAsync(
+      thrust::raw_pointer_cast(&*output)
+    , thrust::raw_pointer_cast(&*first)
+    , sizeof(T) * n
+    , direction_of_copy(from_exec, to_exec)
+    , e.stream().native_handle()
+    )
+  , "after copy launch"
+  );
+
+  return e;
+}
+
+// Non-ContiguousIterator input or output, or non-TriviallyRelocatable value type
+// Device to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&   to_exec
+, ForwardIt                                   first
+, Size                                        n
+, OutputIt                                    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<
+        is_indirectly_trivially_relocatable_to<ForwardIt, OutputIt>
+      >
+    , decltype(is_device_to_device_copy(from_exec, to_exec))
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  return async_transform_n(
+    select_device_system(from_exec, to_exec)
+  , first, n, output, thrust::identity<T>()
+  );
+}
+
+template <typename OutputIt>
+void async_copy_n_compile_failure_no_cuda_to_non_contiguous_output()
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (negation<is_contiguous_iterator<OutputIt>>::value)
+  , "copying to non-ContiguousIterators in another system from the CUDA system "
+    "is not supported; use `THRUST_PROCLAIM_CONTIGUOUS_ITERATOR(Iterator)` to "
+    "indicate that an iterator points to elements that are contiguous in memory."
+  );
+}
+
+// Non-ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Device to host, host to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<is_contiguous_iterator<OutputIt>>
+    , is_trivially_relocatable_to<
+        typename iterator_traits<ForwardIt>::value_type
+      , typename iterator_traits<OutputIt>::value_type
+      >
+    , disjunction<
+        decltype(is_host_to_device_copy(from_exec, to_exec))
+      , decltype(is_device_to_host_copy(from_exec, to_exec))
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  async_copy_n_compile_failure_no_cuda_to_non_contiguous_output<OutputIt>();
+
+  return {};
+}
+
+// Workaround for MSVC's lack of expression SFINAE and also for an NVCC bug.
+// In NVCC, when two SFINAE-enabled overloads are only distinguishable by a
+// part of a SFINAE condition that is in a `decltype`, NVCC thinks they are the
+// same overload and emits an error.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsH2DCopy = decltype(is_host_to_device_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
+>
+struct is_buffered_trivially_relocatable_host_to_device_copy
+  : thrust::integral_constant<
+      bool
+    ,    !is_contiguous_iterator<ForwardIt>::value
+      && is_contiguous_iterator<OutputIt>::value
+      && is_trivially_relocatable_to<
+            typename iterator_traits<ForwardIt>::value_type
+          , typename iterator_traits<OutputIt>::value_type
+          >::value
+      && IsH2DCopy::value
+    >
+{};
+
+// Non-ContiguousIterator input iterator, ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Host to device
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy&                               from_exec
+, thrust::cuda::execution_policy<ToPolicy>& to_exec
+, ForwardIt                                 first
+, Size                                      n
+, OutputIt                                  output
+) ->
+  typename std::enable_if<
+    is_buffered_trivially_relocatable_host_to_device_copy<
+      FromPolicy
+    , thrust::cuda::execution_policy<ToPolicy>
+    , ForwardIt, OutputIt
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const host_alloc = get_async_host_allocator(
+    from_exec
+  );
+
+  // Create host-side buffer.
+
+  auto buffer = uninitialized_allocate_unique_n<T>(host_alloc, n);
+
+  auto const buffer_ptr = buffer.get();
+
+  // Copy into host-side buffer.
+
+  // TODO: Switch to an async call once we have async interfaces for host
+  // systems and support for cross system dependencies.
+  uninitialized_copy_n(from_exec, first, n, buffer_ptr);
+
+  // Run device-side copy.
+
+  auto new_to_exec = thrust::detail::derived_cast(to_exec).rebind_after(
+    std::tuple_cat(
+      std::make_tuple(
+        std::move(buffer)
+      )
+    , extract_dependencies(
+        std::move(thrust::detail::derived_cast(from_exec))
+      )
+    , extract_dependencies(
+        std::move(thrust::detail::derived_cast(to_exec))
+      )
+    )
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(to_exec)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_to_exec)
+    )>::value
+  ));
+
+  return async_copy_n(
+    from_exec
+    // TODO: We have to cast back to the right execution_policy class. Ideally,
+    // we should be moving here.
+  , new_to_exec
+  , buffer_ptr
+  , n
+  , output
+  );
+}
+
+// Workaround for MSVC's lack of expression SFINAE and also for an NVCC bug.
+// In NVCC, when two SFINAE-enabled overloads are only distinguishable by a
+// part of a SFINAE condition that is in a `decltype`, NVCC thinks they are the
+// same overload and emits an error.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt
+  // MSVC2015 WAR: doesn't like decltype(...)::value in superclass definition
+, typename IsD2HCopy = decltype(is_device_to_host_copy(
+    std::declval<FromPolicy const&>()
+  , std::declval<ToPolicy const&>()))
+>
+struct is_buffered_trivially_relocatable_device_to_host_copy
+  : thrust::integral_constant<
+      bool
+    ,    !is_contiguous_iterator<ForwardIt>::value
+      && is_contiguous_iterator<OutputIt>::value
+      && is_trivially_relocatable_to<
+            typename iterator_traits<ForwardIt>::value_type
+          , typename iterator_traits<OutputIt>::value_type
+          >::value
+      && IsD2HCopy::value
+    >
+{};
+
+// Non-ContiguousIterator input iterator, ContiguousIterator output iterator
+// TriviallyRelocatable value type
+// Device to host
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, ToPolicy&                                   to_exec
+, ForwardIt                                   first
+, Size                                        n
+, OutputIt                                    output
+) ->
+  typename std::enable_if<
+    is_buffered_trivially_relocatable_device_to_host_copy<
+      thrust::cuda::execution_policy<FromPolicy>
+    , ToPolicy
+    , ForwardIt, OutputIt
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(
+    from_exec
+  );
+
+  // Create device-side buffer.
+
+  auto buffer = uninitialized_allocate_unique_n<T>(device_alloc, n);
+
+  auto const buffer_ptr = buffer.get();
+
+  // Run device-side copy.
+
+  auto f0 = async_copy_n(
+    from_exec
+  , from_exec
+  , first
+  , n
+  , buffer_ptr
+  );
+
+  // Run copy back to host.
+
+  auto new_from_exec = thrust::detail::derived_cast(from_exec).rebind_after(
+    std::move(buffer)
+  , std::move(f0)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(from_exec)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_from_exec)
+    )>::value
+  ));
+
+  return async_copy_n(
+    new_from_exec
+  , to_exec
+  , buffer_ptr
+  , n
+  , output
+  );
+}
+
+template <typename InputType, typename OutputType>
+void async_copy_n_compile_failure_non_trivially_relocatable_elements()
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (is_trivially_relocatable_to<OutputType, InputType>::value)
+  , "only sequences of TriviallyRelocatable elements can be copied to and from "
+    "the CUDA system; use `THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(T)` to "
+    "indicate that a type can be copied by bitwise (e.g. by `memcpy`)"
+  );
+}
+
+// Non-TriviallyRelocatable value type
+// Host to device, device to host
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename OutputIt, typename Size
+>
+auto async_copy_n(
+  FromPolicy& from_exec
+, ToPolicy&   to_exec
+, ForwardIt   first
+, Size        n
+, OutputIt    output
+) ->
+  typename std::enable_if<
+    conjunction<
+      negation<
+        is_trivially_relocatable_to<
+          typename iterator_traits<ForwardIt>::value_type
+        , typename iterator_traits<OutputIt>::value_type
+        >
+      >
+    , disjunction<
+        decltype(is_host_to_device_copy(from_exec, to_exec))
+      , decltype(is_device_to_host_copy(from_exec, to_exec))
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  // TODO: We could do more here with cudaHostRegister.
+
+  async_copy_n_compile_failure_non_trivially_relocatable_elements<
+    typename thrust::iterator_traits<ForwardIt>::value_type
+  , typename std::add_lvalue_reference<
+      typename thrust::iterator_traits<OutputIt>::value_type
+    >::type
+  >();
+
+  return {};
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cuda::execution_policy<FromPolicy>&         from_exec
+, thrust::cpp::execution_policy<ToPolicy>&            to_exec
+, ForwardIt                                           first
+, Sentinel                                            last
+, OutputIt                                            output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cpp::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&  to_exec
+, ForwardIt                                  first
+, Sentinel                                   last
+, OutputIt                                   output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+// ADL entry point.
+template <
+  typename FromPolicy, typename ToPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+>
+auto async_copy(
+  thrust::cuda::execution_policy<FromPolicy>& from_exec
+, thrust::cuda::execution_policy<ToPolicy>&   to_exec
+, ForwardIt                                   first
+, Sentinel                                    last
+, OutputIt                                    output
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_copy_n(
+    from_exec, to_exec, first, distance(first, last), output
+  )
+)
+
+} // cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/customization.h b/thrust/system/cuda/detail/async/customization.h
new file mode 100644
index 000000000..6f125a6f4
--- /dev/null
+++ b/thrust/system/cuda/detail/async/customization.h
@@ -0,0 +1,127 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/detail/type_deduction.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/execute_with_allocator.h>
+#include <thrust/system/cuda/memory_resource.h>
+#include <thrust/mr/host_memory_resource.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/disjoint_sync_pool.h>
+#include <thrust/mr/sync_pool.h>
+#include <thrust/per_device_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+using default_async_host_resource =
+  thrust::mr::synchronized_pool_resource<
+    thrust::host_memory_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_host_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::mr::stateless_resource_allocator<
+    thrust::detail::uint8_t, default_async_host_resource
+  >{}
+)
+
+///////////////////////////////////////////////////////////////////////////////
+
+using default_async_device_resource =
+  thrust::mr::disjoint_synchronized_pool_resource<
+    thrust::system::cuda::memory_resource
+  , thrust::mr::new_delete_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_device_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::per_device_allocator<
+    thrust::detail::uint8_t, default_async_device_resource, par_t
+  >{}
+)
+
+template <typename Allocator, template <typename> class BaseSystem>
+auto get_async_device_allocator(
+  thrust::detail::execute_with_allocator<Allocator, BaseSystem>& exec
+)
+THRUST_RETURNS(exec.get_allocator())
+
+template <typename Allocator, template <typename> class BaseSystem>
+auto get_async_device_allocator(
+  thrust::detail::execute_with_allocator_and_dependencies<
+    Allocator, BaseSystem
+  >& exec
+)
+THRUST_RETURNS(exec.get_allocator())
+
+///////////////////////////////////////////////////////////////////////////////
+
+using default_async_universal_host_pinned_resource =
+  thrust::mr::synchronized_pool_resource<
+    thrust::system::cuda::universal_host_pinned_memory_resource
+  >;
+
+template <typename DerivedPolicy>
+auto get_async_universal_host_pinned_allocator(
+  thrust::detail::execution_policy_base<DerivedPolicy>&
+)
+THRUST_RETURNS(
+  thrust::mr::stateless_resource_allocator<
+    thrust::detail::uint8_t, default_async_universal_host_pinned_resource
+  >{}
+)
+
+}}} // namespace system::cuda::detail
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/exclusive_scan.h b/thrust/system/cuda/detail/async/exclusive_scan.h
new file mode 100644
index 000000000..0b120a434
--- /dev/null
+++ b/thrust/system/cuda/detail/async/exclusive_scan.h
@@ -0,0 +1,201 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. ExclusiveSum instead of ExcScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+THRUST_NAMESPACE_BEGIN
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+unique_eager_event
+async_exclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                       ForwardIt first,
+                       Size n,
+                       OutputIt out,
+                       InitialValueType init,
+                       BinaryOp op)
+{
+  using InputValueT = cub::detail::InputValue<InitialValueType>;
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InputValueT,
+                                       thrust::detail::int32_t,
+                                       InitialValueType>;
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       InputValueT,
+                                       thrust::detail::int64_t,
+                                       InitialValueType>;
+
+  InputValueT init_value(init);
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init_value,
+                                  n_fixed,
+                                  nullptr));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  init_value,
+                                  n_fixed,
+                                  user_raw_stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching exclusive_scan kernel");
+  }
+
+  return ev;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename InitialValueType,
+          typename BinaryOp>
+auto async_exclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          InitialValueType &&init,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_exclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(init),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/for_each.h b/thrust/system/cuda/detail/async/for_each.h
new file mode 100644
index 000000000..d6809fe0a
--- /dev/null
+++ b/thrust/system/cuda/detail/async/for_each.h
@@ -0,0 +1,157 @@
+
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <typename ForwardIt, typename UnaryFunction>
+struct async_for_each_fn
+{
+  ForwardIt first;
+  UnaryFunction f;
+
+  __host__ __device__
+  async_for_each_fn(ForwardIt&& first_, UnaryFunction&& f_)
+    : first(std::move(first_)), f(std::move(f_))
+  {}
+
+  template <typename Index>
+  __host__ __device__
+  void operator()(Index idx)
+  {
+    f(thrust::raw_reference_cast(first[idx]));
+  }
+};
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename UnaryFunction
+>
+unique_eager_event async_for_each_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  UnaryFunction                    func
+) {
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      extract_dependencies(
+        std::move(thrust::detail::derived_cast(policy))
+      )
+    );
+  }
+
+  // Run for_each.
+
+  async_for_each_fn<ForwardIt, UnaryFunction> wrapped(
+    std::move(first), std::move(func)
+  );
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__parallel_for::parallel_for(
+      n, std::move(wrapped), e.stream().native_handle()
+    )
+  , "after for_each launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename UnaryFunction
+>
+auto async_for_each(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  UnaryFunction&&                  func
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_for_each_n(
+    policy, first, distance(first, last), THRUST_FWD(func)
+  )
+);
+
+} // cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/inclusive_scan.h b/thrust/system/cuda/detail/async/inclusive_scan.h
new file mode 100644
index 000000000..363347c35
--- /dev/null
+++ b/thrust/system/cuda/detail/async/inclusive_scan.h
@@ -0,0 +1,194 @@
+/******************************************************************************
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/future.h>
+
+#include <thrust/type_traits/remove_cvref.h>
+
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+// TODO specialize for thrust::plus to use e.g. InclusiveSum instead of IncScan
+//  - Note that thrust::plus<> is transparent, cub::Sum is not. This should be
+//    fixed in CUB first).
+//  - Need to check if CUB actually optimizes for sums before putting in effort
+
+THRUST_NAMESPACE_BEGIN
+namespace system
+{
+namespace cuda
+{
+namespace detail
+{
+
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Size,
+          typename OutputIt,
+          typename BinaryOp>
+unique_eager_event
+async_inclusive_scan_n(execution_policy<DerivedPolicy>& policy,
+                       ForwardIt first,
+                       Size n,
+                       OutputIt out,
+                       BinaryOp op)
+{
+  using AccumT = typename thrust::iterator_traits<ForwardIt>::value_type;
+  using Dispatch32 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int32_t,
+                                       AccumT>;
+  using Dispatch64 = cub::DispatchScan<ForwardIt,
+                                       OutputIt,
+                                       BinaryOp,
+                                       cub::NullType,
+                                       thrust::detail::int64_t,
+                                       AccumT>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+  unique_eager_event ev;
+
+  // Determine temporary device storage requirements.
+  cudaError_t status;
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (nullptr,
+                                  tmp_size,
+                                  first,
+                                  out,
+                                  op,
+                                  cub::NullType{},
+                                  n_fixed,
+                                  nullptr));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan");
+  }
+
+  // Allocate temporary storage.
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+  void* const tmp_ptr = raw_pointer_cast(content.get());
+
+  // Set up stream with dependencies.
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content),
+          unique_stream(nonowning, user_raw_stream)
+        ),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+  else
+  {
+    ev = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(std::move(content)),
+        extract_dependencies(std::move(thrust::detail::derived_cast(policy)))));
+  }
+
+  // Run scan.
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                n,
+                                (tmp_ptr,
+                                 tmp_size,
+                                 first,
+                                 out,
+                                 op,
+                                 cub::NullType{},
+                                 n_fixed,
+                                 user_raw_stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching inclusive_scan kernel");
+  }
+
+  return ev;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <typename DerivedPolicy,
+          typename ForwardIt,
+          typename Sentinel,
+          typename OutputIt,
+          typename BinaryOp>
+auto async_inclusive_scan(execution_policy<DerivedPolicy>& policy,
+                          ForwardIt first,
+                          Sentinel&& last,
+                          OutputIt&& out,
+                          BinaryOp&& op)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_inclusive_scan_n(
+    policy,
+    first,
+    distance(first, THRUST_FWD(last)),
+    THRUST_FWD(out),
+    THRUST_FWD(op)
+  )
+)
+
+} // namespace cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/async/reduce.h b/thrust/system/cuda/detail/async/reduce.h
new file mode 100644
index 000000000..2d0dbfe16
--- /dev/null
+++ b/thrust/system/cuda/detail/async/reduce.h
@@ -0,0 +1,343 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Optimize for thrust::plus
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename T, typename BinaryOp
+>
+unique_eager_future<remove_cvref_t<T>> async_reduce_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, T                                init
+, BinaryOp                         op
+) {
+  using U = remove_cvref_t<T>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  using pointer
+    = typename thrust::detail::allocator_traits<decltype(device_alloc)>::
+      template rebind_traits<U>::pointer;
+
+  unique_eager_future_promise_pair<U, pointer> fp;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<U*>(nullptr)
+    , n
+    , op
+    , init
+    , nullptr // Null stream, just for sizing.
+    )
+  , "after reduction sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, sizeof(U) + tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+  U* const ret_ptr = thrust::detail::aligned_reinterpret_cast<U*>(
+    raw_pointer_cast(content_ptr)
+  );
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr + sizeof(U))
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    fp = make_dependent_future<U, pointer>(
+      [] (decltype(content) const& c)
+      {
+        return pointer(
+          thrust::detail::aligned_reinterpret_cast<U*>(
+            raw_pointer_cast(c.get())
+          )
+        );
+      }
+    , std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    fp = make_dependent_future<U, pointer>(
+      [] (decltype(content) const& c)
+      {
+        return pointer(
+          thrust::detail::aligned_reinterpret_cast<U*>(
+            raw_pointer_cast(c.get())
+          )
+        );
+      }
+    , std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run reduction.
+
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      tmp_ptr
+    , tmp_size
+    , first
+    , ret_ptr
+    , n
+    , op
+    , init
+    , fp.future.stream().native_handle()
+    )
+  , "after reduction launch"
+  );
+
+  return std::move(fp.future);
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename T, typename BinaryOp
+>
+auto async_reduce(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Sentinel                         last
+, T                                init
+, BinaryOp                         op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_reduce_n(
+    policy, first, distance(first, last), init, op
+  )
+)
+
+} // cuda_cub
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename OutputIt
+, typename T, typename BinaryOp
+>
+unique_eager_event async_reduce_into_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, OutputIt                         output
+, T                                init
+, BinaryOp                         op
+) {
+  using U = remove_cvref_t<T>;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<U*>(nullptr)
+    , n
+    , op
+    , init
+    , nullptr // Null stream, just for sizing.
+    )
+  , "after reduction sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run reduction.
+
+  thrust::cuda_cub::throw_on_error(
+    cub::DeviceReduce::Reduce(
+      tmp_ptr
+    , tmp_size
+    , first
+    , output
+    , n
+    , op
+    , init
+    , e.stream().native_handle()
+    )
+  , "after reduction launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename T, typename BinaryOp
+>
+auto async_reduce_into(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Sentinel                         last
+, OutputIt                         output
+, T                                init
+, BinaryOp                         op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_reduce_into_n(
+    policy, first, distance(first, last), output, init, op
+  )
+)
+
+} // cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/cub/util_namespace.cuh b/thrust/system/cuda/detail/async/scan.h
similarity index 69%
rename from thrust/system/cuda/detail/cub/util_namespace.cuh
rename to thrust/system/cuda/detail/async/scan.h
index 52be7c213..4a9f31681 100644
--- a/thrust/system/cuda/detail/cub/util_namespace.cuh
+++ b/thrust/system/cuda/detail/async/scan.h
@@ -1,6 +1,5 @@
 /******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
+ * Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
@@ -13,10 +12,10 @@
  *       names of its contributors may be used to endorse or promote products
  *       derived from this software without specific prior written permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
@@ -26,21 +25,9 @@
  *
  ******************************************************************************/
 
-/**
- * \file
- * Place-holder for prefixing the cub namespace
- */
-
 #pragma once
 
-// For example:
-//#define CUB_NS_PREFIX namespace thrust{ namespace detail {
-//#define CUB_NS_POSTFIX } }
-
-#ifndef CUB_NS_PREFIX
-#define CUB_NS_PREFIX
-#endif
+#include <thrust/detail/cpp14_required.h>
 
-#ifndef CUB_NS_POSTFIX
-#define CUB_NS_POSTFIX
-#endif
+#include <thrust/system/cuda/detail/async/exclusive_scan.h>
+#include <thrust/system/cuda/detail/async/inclusive_scan.h>
diff --git a/thrust/system/cuda/detail/async/sort.h b/thrust/system/cuda/detail/async/sort.h
new file mode 100644
index 000000000..f501f19c5
--- /dev/null
+++ b/thrust/system/cuda/detail/async/sort.h
@@ -0,0 +1,517 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/async/copy.h>
+#include <thrust/system/cuda/detail/sort.h>
+#include <thrust/detail/alignment.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+#include <thrust/type_traits/is_operator_less_or_greater_function_object.h>
+#include <thrust/type_traits/logical_metafunctions.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/distance.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+// Non-ContiguousIterator input and output iterators
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    negation<is_contiguous_iterator<ForwardIt>>::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  // Create device-side buffer.
+
+  // FIXME: Combine this temporary allocation with the main one for CUB.
+  auto device_buffer = uninitialized_allocate_unique_n<T>(device_alloc, n);
+
+  auto const device_buffer_ptr = device_buffer.get();
+
+  // Synthesize a suitable new execution policy, because we don't want to
+  // try and extract twice from the one we were passed.
+  typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
+
+  // Copy from the input into the buffer.
+
+  auto new_policy0 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(device_buffer)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy0)
+    )>::value
+  ));
+
+  auto f0 = async_copy_n(
+    new_policy0
+  , tag_policy
+  , first
+  , n
+  , device_buffer_ptr
+  );
+
+  // Sort the buffer.
+
+  auto new_policy1 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(f0)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy1)
+    )>::value
+  ));
+
+  auto f1 = async_sort_n(
+    new_policy1
+  , tag_policy
+  , device_buffer_ptr
+  , n
+  , comp
+  );
+
+  // Copy from the buffer into the input.
+  // FIXME: Combine this with the potential memcpy at the end of the main sort
+  // routine.
+
+  auto new_policy2 = thrust::detail::derived_cast(policy).rebind_after(
+    std::move(f1)
+  );
+
+  THRUST_STATIC_ASSERT((
+    std::tuple_size<decltype(
+      extract_dependencies(policy)
+    )>::value + 1
+    <=
+    std::tuple_size<decltype(
+      extract_dependencies(new_policy2)
+    )>::value
+  ));
+
+  return async_copy_n(
+    new_policy2
+  , tag_policy
+  , device_buffer_ptr
+  , n
+  , first
+  );
+}
+
+// ContiguousIterator iterators
+// Non-Scalar value type or user-defined StrictWeakOrdering
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    conjunction<
+      is_contiguous_iterator<ForwardIt>
+    , disjunction<
+        negation<
+          std::is_scalar<
+            typename iterator_traits<ForwardIt>::value_type
+          >
+        >
+      , negation<
+          is_operator_less_or_greater_function_object<StrictWeakOrdering>
+        >
+      >
+    >::value
+  , unique_eager_event
+  >::type
+{
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__merge_sort::doit_step<
+      /* Sort items? */ std::false_type, /* Stable? */ std::true_type
+    >(
+      nullptr
+    , tmp_size
+    , first
+    , static_cast<thrust::detail::uint8_t*>(nullptr) // Items.
+    , n
+    , comp
+    , nullptr // Null stream, just for sizing.
+    )
+  , "after merge sort sizing"
+  );
+
+  // Allocate temporary storage.
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run merge sort.
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__merge_sort::doit_step<
+      /* Sort items? */ std::false_type, /* Stable? */ std::true_type
+    >(
+      tmp_ptr
+    , tmp_size
+    , first
+    , static_cast<thrust::detail::uint8_t*>(nullptr) // Items.
+    , n
+    , comp
+    , e.stream().native_handle()
+    )
+  , "after merge sort sizing"
+  );
+
+  return e;
+}
+
+template <typename T, typename Size, typename StrictWeakOrdering>
+typename std::enable_if<
+  is_operator_less_function_object<StrictWeakOrdering>::value
+, cudaError_t
+>::type
+invoke_radix_sort(
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
+, StrictWeakOrdering
+)
+{
+  return cub::DeviceRadixSort::SortKeys(
+    tmp_ptr
+  , tmp_size
+  , keys
+  , n
+  , 0
+  , sizeof(T) * 8
+  , stream
+  );
+}
+
+template <typename T, typename Size, typename StrictWeakOrdering>
+typename std::enable_if<
+  is_operator_greater_function_object<StrictWeakOrdering>::value
+, cudaError_t
+>::type
+invoke_radix_sort(
+  cudaStream_t          stream
+, void*                 tmp_ptr
+, std::size_t&          tmp_size
+, cub::DoubleBuffer<T>& keys
+, Size&                 n
+, StrictWeakOrdering
+)
+{
+  return cub::DeviceRadixSort::SortKeysDescending(
+    tmp_ptr
+  , tmp_size
+  , keys
+  , n
+  , 0
+  , sizeof(T) * 8
+  , stream
+  );
+}
+
+// ContiguousIterator iterators
+// Scalar value type
+// operator< or operator>
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename StrictWeakOrdering
+>
+auto async_stable_sort_n(
+  execution_policy<DerivedPolicy>& policy
+, ForwardIt                        first
+, Size                             n
+, StrictWeakOrdering               comp
+) ->
+  typename std::enable_if<
+    conjunction<
+      is_contiguous_iterator<ForwardIt>
+    , std::is_scalar<
+        typename iterator_traits<ForwardIt>::value_type
+      >
+    , is_operator_less_or_greater_function_object<StrictWeakOrdering>
+    >::value
+  , unique_eager_event
+  >::type
+{
+  using T = typename iterator_traits<ForwardIt>::value_type;
+
+  auto const device_alloc = get_async_device_allocator(policy);
+
+  unique_eager_event e;
+
+  cub::DoubleBuffer<T> keys(
+    raw_pointer_cast(&*first), nullptr
+  );
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+  thrust::cuda_cub::throw_on_error(
+    invoke_radix_sort(
+      nullptr // Null stream, just for sizing.
+    , nullptr
+    , tmp_size
+    , keys
+    , n
+    , comp
+    )
+  , "after radix sort sizing"
+  );
+
+  // Allocate temporary storage.
+
+  size_t keys_temp_storage = thrust::detail::aligned_storage_size(
+    sizeof(T) * n, 128
+  );
+
+  auto content = uninitialized_allocate_unique_n<thrust::detail::uint8_t>(
+    device_alloc, keys_temp_storage + tmp_size
+  );
+
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  auto const content_ptr = content.get();
+
+  keys.d_buffers[1] = thrust::detail::aligned_reinterpret_cast<T*>(
+    raw_pointer_cast(content_ptr)
+  );
+
+  void* const tmp_ptr = static_cast<void*>(
+    raw_pointer_cast(content_ptr + keys_temp_storage)
+  );
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        , unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          std::move(content)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+
+  // Run radix sort.
+
+  thrust::cuda_cub::throw_on_error(
+    invoke_radix_sort(
+      e.stream().native_handle()
+    , tmp_ptr
+    , tmp_size
+    , keys
+    , n
+    , comp
+    )
+  , "after radix sort launch"
+  );
+
+  if (0 != keys.selector)
+  {
+    auto new_policy0 = thrust::detail::derived_cast(policy).rebind_after(
+      std::move(e)
+    );
+
+    THRUST_STATIC_ASSERT((
+      std::tuple_size<decltype(
+        extract_dependencies(policy)
+      )>::value + 1
+      <=
+      std::tuple_size<decltype(
+        extract_dependencies(new_policy0)
+      )>::value
+    ));
+
+    // Synthesize a suitable new execution policy, because we don't want to
+    // try and extract twice from the one we were passed.
+    typename remove_cvref_t<decltype(policy)>::tag_type tag_policy{};
+
+    using return_future = decltype(e);
+    return return_future(async_copy_n(
+      new_policy0
+    , tag_policy
+    , keys.d_buffers[1]
+    , n
+    , keys.d_buffers[0]
+    ));
+  }
+  else
+    return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename StrictWeakOrdering
+>
+auto async_stable_sort(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  StrictWeakOrdering               comp
+)
+// A GCC 5 bug requires an explicit trailing return type here, so stick with
+// THRUST_DECLTYPE_RETURNS for now.
+THRUST_DECLTYPE_RETURNS(
+  thrust::system::cuda::detail::async_stable_sort_n(
+    policy, first, distance(first, last), comp
+  )
+)
+
+} // cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/async/transform.h b/thrust/system/cuda/detail/async/transform.h
new file mode 100644
index 000000000..a971300f2
--- /dev/null
+++ b/thrust/system/cuda/detail/async/transform.h
@@ -0,0 +1,161 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+// TODO: Move into system::cuda
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/async/customization.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda { namespace detail
+{
+
+template <typename ForwardIt, typename OutputIt, typename UnaryOperation>
+struct async_transform_fn
+{
+  ForwardIt first_;
+  OutputIt output_;
+  UnaryOperation op_;
+
+  __host__ __device__
+  async_transform_fn(ForwardIt&& first, OutputIt&& output, UnaryOperation&& op)
+    : first_(std::move(first)), output_(std::move(output)), op_(std::move(op))
+  {}
+
+  template <typename Index>
+  __host__ __device__
+  void operator()(Index idx)
+  {
+    output_[idx] = op_(thrust::raw_reference_cast(first_[idx]));
+  }
+};
+
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Size, typename OutputIt, typename UnaryOperation
+>
+unique_eager_event async_transform_n(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Size                             n,
+  OutputIt                         output,
+  UnaryOperation                   op
+) {
+  unique_eager_event e;
+
+  // Set up stream with dependencies.
+
+  cudaStream_t const user_raw_stream = thrust::cuda_cub::stream(policy);
+
+  if (thrust::cuda_cub::default_stream() != user_raw_stream)
+  {
+    e = make_dependent_event(
+      std::tuple_cat(
+        std::make_tuple(
+          unique_stream(nonowning, user_raw_stream)
+        )
+      , extract_dependencies(
+          std::move(thrust::detail::derived_cast(policy))
+        )
+      )
+    );
+  }
+  else
+  {
+    e = make_dependent_event(
+      extract_dependencies(
+        std::move(thrust::detail::derived_cast(policy))
+      )
+    );
+  }
+
+  // Run transform.
+
+  async_transform_fn<ForwardIt, OutputIt, UnaryOperation> wrapped(
+    std::move(first), std::move(output), std::move(op)
+  );
+
+  thrust::cuda_cub::throw_on_error(
+    thrust::cuda_cub::__parallel_for::parallel_for(
+      n, std::move(wrapped), e.stream().native_handle()
+    )
+  , "after transform launch"
+  );
+
+  return e;
+}
+
+}}} // namespace system::cuda::detail
+
+namespace cuda_cub
+{
+
+// ADL entry point.
+template <
+  typename DerivedPolicy
+, typename ForwardIt, typename Sentinel, typename OutputIt
+, typename UnaryOperation
+>
+auto async_transform(
+  execution_policy<DerivedPolicy>& policy,
+  ForwardIt                        first,
+  Sentinel                         last,
+  OutputIt                         output,
+  UnaryOperation&&                 op
+)
+THRUST_RETURNS(
+  thrust::system::cuda::detail::async_transform_n(
+    policy, first, distance(first, last), output, THRUST_FWD(op)
+  )
+);
+
+} // cuda_cub
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#endif
+
diff --git a/thrust/system/cuda/detail/binary_search.h b/thrust/system/cuda/detail/binary_search.h
index c6ae90664..fb769a4ac 100644
--- a/thrust/system/cuda/detail/binary_search.h
+++ b/thrust/system/cuda/detail/binary_search.h
@@ -1,22 +1,19 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+*  Copyright 2021 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
 
 #pragma once
 
-#include <thrust/detail/config.h>
-
-// this system has no special version of this algorithm 
-
+// this system has no special version of this algorithm
diff --git a/thrust/system/cuda/detail/block/copy.h b/thrust/system/cuda/detail/block/copy.h
deleted file mode 100644
index 6d02c52d1..000000000
--- a/thrust/system/cuda/detail/block/copy.h
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*! \file copy.h
- *  \brief CUDA implementation of device-to-device copy,
- *         based on Gregory Diamos' memcpy code.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-#include <thrust/pair.h>
-
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/dispatch/is_trivial_copy.h>
-#include <thrust/detail/raw_reference_cast.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-namespace trivial_copy_detail
-{
-
-
-template<typename Size>
-  inline __device__ thrust::pair<Size,Size> quotient_and_remainder(Size n, Size d)
-{
-  Size quotient  = n / d;
-  Size remainder = n - d * quotient; 
-  return thrust::make_pair(quotient,remainder);
-} // end quotient_and_remainder()
-
-
-// assumes the addresses dst & src are aligned to T boundaries
-template<typename Context,
-         typename T>
-__device__ __thrust_forceinline__
-void aligned_copy(Context context, T *dst, const T *src, unsigned int num_elements)
-{
-  for(unsigned int i = context.thread_index();
-      i < num_elements;
-      i += context.block_dimension())
-  {
-    dst[i] = src[i];
-  }
-} // end aligned_copy()
-
-
-} // end namespace trivial_copy_detail
-
-
-template <typename Context>
-__device__ __thrust_forceinline__
-void trivial_copy(Context context, void* destination_, const void* source_, size_t num_bytes)
-{
-  // reinterpret at bytes
-  char* destination  = reinterpret_cast<char*>(destination_);
-  const char* source = reinterpret_cast<const char*>(source_);
- 
-  // TODO replace this with uint64
-#if THRUST_DEVICE_COMPILER != THRUST_DEVICE_COMPILER_NVCC
-  typedef long long  int2;
-  typedef long long uint2;
-#endif // THRUST_DEVICE_COMPILER_NVCC
-
-  // check alignment
-  // XXX can we do this in three steps?
-  //     1. copy until alignment is met
-  //     2. go hog wild
-  //     3. get the remainder
-  if(reinterpret_cast<size_t>(destination) % sizeof(uint2) != 0 || reinterpret_cast<size_t>(source) % sizeof(uint2) != 0)
-  {
-    for(unsigned int i = context.thread_index(); i < num_bytes; i += context.block_dimension())
-    {
-      destination[i] = source[i];
-    }
-  }
-  else
-  {
-    // it's aligned; do a wide copy
-
-    // this pair stores the number of int2s in the aligned portion of the arrays
-    // and the number of bytes in the remainder
-    const thrust::pair<size_t,size_t> num_wide_elements_and_remainder_bytes = trivial_copy_detail::quotient_and_remainder(num_bytes, sizeof(int2));
-
-    // copy int2 elements
-    trivial_copy_detail::aligned_copy(context,
-                                      reinterpret_cast<int2*>(destination),
-                                      reinterpret_cast<const int2*>(source),
-                                      num_wide_elements_and_remainder_bytes.first);
-
-    // XXX we could copy int elements here
-
-    // copy remainder byte by byte
-
-    // to find the beginning of the remainder arrays, we need to point at the beginning, and then skip the number of bytes in the aligned portion
-    // this is sizeof(int2) times the number of int2s comprising the aligned portion
-    const char *remainder_first  = reinterpret_cast<const char*>(source + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);
-          char *remainder_result = reinterpret_cast<char*>(destination  + sizeof(int2) * num_wide_elements_and_remainder_bytes.first);
-
-    trivial_copy_detail::aligned_copy(context, remainder_result, remainder_first, num_wide_elements_and_remainder_bytes.second);
-  }
-} // end trivial_copy()
-
-
-namespace detail
-{
-namespace dispatch
-{
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  __thrust_forceinline__ __device__
-  RandomAccessIterator2 copy(Context context,
-                             RandomAccessIterator1 first,
-                             RandomAccessIterator1 last,
-                             RandomAccessIterator2 result,
-                             thrust::detail::true_type is_trivial_copy)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;
-
-  const T *src = &thrust::raw_reference_cast(*first);
-        T *dst = &thrust::raw_reference_cast(*result);
-
-  size_t n = (last - first);
-  thrust::system::cuda::detail::block::trivial_copy(context, dst, src, n * sizeof(T));
-  return result + n;
-} // end copy()
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  __thrust_forceinline__ __device__
-  RandomAccessIterator2 copy(Context context, 
-                             RandomAccessIterator1 first,
-                             RandomAccessIterator1 last,
-                             RandomAccessIterator2 result,
-                             thrust::detail::false_type is_trivial_copy)
-{
-  RandomAccessIterator2 end_of_output = result + (last - first);
-  
-  // advance iterators
-  first  += context.thread_index();
-  result += context.thread_index();
-
-  for(;
-      first < last;
-      first  += context.block_dimension(),
-      result += context.block_dimension())
-  {
-    *result = *first;
-  } // end for
-
-  return end_of_output;
-} // end copy()
-
-} // end namespace dispatch
-} // end namespace detail
-
-template<typename Context, 
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  __thrust_forceinline__ __device__
-  RandomAccessIterator2 copy(Context context,
-                             RandomAccessIterator1 first,
-                             RandomAccessIterator1 last,
-                             RandomAccessIterator2 result)
-{
-  return detail::dispatch::copy(context, first, last, result,
-#if __CUDA_ARCH__ < 200
-      // does not work reliably on pre-Fermi due to "Warning: ... assuming global memory space" issues
-      thrust::detail::false_type()
-#else
-      typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type()
-#endif
-      );
-} // end copy()
-
-
-template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-inline __device__
-RandomAccessIterator2 async_copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  for(Size i = ctx.thread_index(); i < n; i += ctx.block_dimension())
-  {
-    result[i] = first[i];
-  }
-
-  return result + n;
-}
-
-
-template<typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-inline __device__
-RandomAccessIterator2 copy_n(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  result = async_copy_n(ctx, first, n, result);
-  ctx.barrier();
-
-  return result;
-}
-
-
-template<unsigned int work_per_thread, typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-inline __device__
-RandomAccessIterator2 async_copy_n_global_to_shared(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-  // stage copy through registers
-  value_type reg[work_per_thread];
-
-  // avoid conditional accesses when possible
-  if(n >= ctx.block_dimension() * work_per_thread)
-  {
-    for(unsigned int i = 0; i < work_per_thread; ++i)
-    {
-      unsigned int idx = ctx.block_dimension() * i + ctx.thread_index();
-
-      reg[i] = first[idx];
-    }
-  }
-  else
-  {
-    for(unsigned int i = 0; i < work_per_thread; ++i)
-    {
-      unsigned int idx = ctx.block_dimension() * i + ctx.thread_index();
-
-      if(idx < n) reg[i] = first[idx];
-    }
-  }
-
-  // avoid conditional accesses when possible
-  if(n >= ctx.block_dimension() * work_per_thread)
-  {
-    for(unsigned int i = 0; i < work_per_thread; ++i)
-    {
-      unsigned int idx = ctx.block_dimension() * i + ctx.thread_index();
-
-      result[idx] = reg[i];
-    }
-  }
-  else
-  {
-    for(unsigned int i = 0; i < work_per_thread; ++i)
-    {
-      unsigned int idx = ctx.block_dimension() * i + ctx.thread_index();
-
-      if(idx < n) result[idx] = reg[i];
-    }
-  }
-
-  return result + n;
-}
-
-
-template<unsigned int work_per_thread, typename Context, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-__device__
-RandomAccessIterator2 copy_n_global_to_shared(Context &ctx, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  result = async_copy_n_global_to_shared<work_per_thread>(ctx, first, n, result);
-
-  ctx.barrier();
-
-  return result + n;
-}
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/exclusive_scan.h b/thrust/system/cuda/detail/block/exclusive_scan.h
deleted file mode 100644
index b287bb021..000000000
--- a/thrust/system/cuda/detail/block/exclusive_scan.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/functional.h>
-#include <thrust/system/cuda/detail/block/inclusive_scan.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-
-template<typename Context, typename RandomAccessIterator, typename T, typename BinaryFunction>
-inline __device__
-typename thrust::iterator_value<RandomAccessIterator>::type
-  inplace_exclusive_scan(Context &ctx, RandomAccessIterator first, T init, BinaryFunction op)
-{
-  // perform an inclusive scan, then shift right
-  block::inplace_inclusive_scan(ctx, first, op);
-
-  typename thrust::iterator_value<RandomAccessIterator>::type carry = first[ctx.block_dimension() - 1];
-
-  ctx.barrier();
-
-  typename thrust::iterator_value<RandomAccessIterator>::type left = (ctx.thread_index() == 0) ? init : first[ctx.thread_index() - 1];
-
-  ctx.barrier();
-
-  first[ctx.thread_index()] = left;
-
-  ctx.barrier();
-
-  return carry;
-}
-
-
-template<typename Context, typename Iterator, typename T>
-inline __device__
-  typename thrust::iterator_value<Iterator>::type
-    inplace_exclusive_scan(Context &ctx, Iterator first, T init)
-{
-  return block::inplace_exclusive_scan(ctx, first, init, thrust::plus<typename thrust::iterator_value<Iterator>::type>());
-}
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/inclusive_scan.h b/thrust/system/cuda/detail/block/inclusive_scan.h
deleted file mode 100644
index 27ed65a73..000000000
--- a/thrust/system/cuda/detail/block/inclusive_scan.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/iterator/iterator_traits.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-template<typename Context,
-         typename InputIterator,
-         typename BinaryFunction>
-__device__ __thrust_forceinline__
-void inclusive_scan(Context context,
-                    InputIterator first,
-                    BinaryFunction binary_op)
-{
-  // TODO generalize to arbitrary n
-  // TODO support dynamic block_size
-  const unsigned int block_size = Context::ThreadsPerBlock::value;
-
-  typename thrust::iterator_value<InputIterator>::type val = first[context.thread_index()];
-
-  if(block_size >    1) { if (context.thread_index() >=    1) { val = binary_op(first[context.thread_index() -    1], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >    2) { if (context.thread_index() >=    2) { val = binary_op(first[context.thread_index() -    2], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); } 
-  if(block_size >    4) { if (context.thread_index() >=    4) { val = binary_op(first[context.thread_index() -    4], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >    8) { if (context.thread_index() >=    8) { val = binary_op(first[context.thread_index() -    8], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   16) { if (context.thread_index() >=   16) { val = binary_op(first[context.thread_index() -   16], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   32) { if (context.thread_index() >=   32) { val = binary_op(first[context.thread_index() -   32], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   64) { if (context.thread_index() >=   64) { val = binary_op(first[context.thread_index() -   64], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  128) { if (context.thread_index() >=  128) { val = binary_op(first[context.thread_index() -  128], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  256) { if (context.thread_index() >=  256) { val = binary_op(first[context.thread_index() -  256], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  512) { if (context.thread_index() >=  512) { val = binary_op(first[context.thread_index() -  512], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-  if(block_size > 1024) { if (context.thread_index() >= 1024) { val = binary_op(first[context.thread_index() - 1024], val); } context.barrier(); first[context.thread_index()] = val; context.barrier(); }
-} // end inclusive_scan()
-
-
-template<typename Context,
-         typename InputIterator,
-         typename Size,
-         typename BinaryFunction>
-__device__ __thrust_forceinline__
-void inclusive_scan_n(Context context,
-                      InputIterator first,
-                      Size n,
-                      BinaryFunction binary_op)
-{
-  // TODO support n > context.block_dimension()
-  typename thrust::iterator_value<InputIterator>::type val = first[context.thread_index()];
-
-  for (unsigned int i = 1; i < n; i <<= 1)
-  {
-    if (context.thread_index() < n && context.thread_index() >= i)
-      val = binary_op(first[context.thread_index() - i], val);
-
-    context.barrier();
-    
-    first[context.thread_index()] = val;
-    
-    context.barrier();
-  }
-} // end inclusive_scan()
-
-
-template<typename Context,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename BinaryFunction>
-__device__ __thrust_forceinline__
-void inclusive_scan_by_flag(Context context,
-                            InputIterator1 first1,
-                            InputIterator2 first2,
-                            BinaryFunction binary_op)
-{
-  // TODO generalize to arbitrary n
-  // TODO support dynamic block_size
-  const unsigned int block_size = Context::ThreadsPerBlock::value;
-
-  typename thrust::iterator_value<InputIterator1>::type flg = first1[context.thread_index()];
-  typename thrust::iterator_value<InputIterator2>::type val = first2[context.thread_index()];
-
-  if(block_size >    1) { if (context.thread_index() >=    1) { if (!flg) { flg |= first1[context.thread_index() -    1]; val = binary_op(first2[context.thread_index() -    1], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >    2) { if (context.thread_index() >=    2) { if (!flg) { flg |= first1[context.thread_index() -    2]; val = binary_op(first2[context.thread_index() -    2], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); } 
-  if(block_size >    4) { if (context.thread_index() >=    4) { if (!flg) { flg |= first1[context.thread_index() -    4]; val = binary_op(first2[context.thread_index() -    4], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >    8) { if (context.thread_index() >=    8) { if (!flg) { flg |= first1[context.thread_index() -    8]; val = binary_op(first2[context.thread_index() -    8], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   16) { if (context.thread_index() >=   16) { if (!flg) { flg |= first1[context.thread_index() -   16]; val = binary_op(first2[context.thread_index() -   16], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   32) { if (context.thread_index() >=   32) { if (!flg) { flg |= first1[context.thread_index() -   32]; val = binary_op(first2[context.thread_index() -   32], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >   64) { if (context.thread_index() >=   64) { if (!flg) { flg |= first1[context.thread_index() -   64]; val = binary_op(first2[context.thread_index() -   64], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  128) { if (context.thread_index() >=  128) { if (!flg) { flg |= first1[context.thread_index() -  128]; val = binary_op(first2[context.thread_index() -  128], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  256) { if (context.thread_index() >=  256) { if (!flg) { flg |= first1[context.thread_index() -  256]; val = binary_op(first2[context.thread_index() -  256], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size >  512) { if (context.thread_index() >=  512) { if (!flg) { flg |= first1[context.thread_index() -  512]; val = binary_op(first2[context.thread_index() -  512], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-  if(block_size > 1024) { if (context.thread_index() >= 1024) { if (!flg) { flg |= first1[context.thread_index() - 1024]; val = binary_op(first2[context.thread_index() - 1024], val); } } context.barrier(); first1[context.thread_index()] = flg; first2[context.thread_index()] = val; context.barrier(); }
-} // end inclusive_scan_by_flag()
-
-
-template<typename Context,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename Size,
-         typename BinaryFunction>
-__device__ __thrust_forceinline__
-void inclusive_scan_by_flag_n(Context context,
-                              InputIterator1 first1,
-                              InputIterator2 first2,
-                              Size n,
-                              BinaryFunction binary_op)
-{
-  // TODO support n > context.block_dimension()
-  typename thrust::iterator_value<InputIterator1>::type flg = first1[context.thread_index()];
-  typename thrust::iterator_value<InputIterator2>::type val = first2[context.thread_index()];
-  
-  for (unsigned int i = 1; i < n; i <<= 1)
-  {
-    if (context.thread_index() < n && context.thread_index() >= i) 
-    {
-      if (!flg)
-      { 
-        flg |= first1[context.thread_index() - i];
-        val  = binary_op(first2[context.thread_index() - i], val);
-      }
-    }
-
-    context.barrier();
-    
-    first1[context.thread_index()] = flg;
-    first2[context.thread_index()] = val;
-    
-    context.barrier();
-  }
-} // end inclusive_scan_by_flag()
-
-
-template<typename Context, typename RandomAccessIterator, typename BinaryFunction>
-__device__ __thrust_forceinline__
-void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first, BinaryFunction op)
-{
-  typename thrust::iterator_value<RandomAccessIterator>::type x = first[ctx.thread_index()];
-
-  for(unsigned int offset = 1; offset < ctx.block_dimension(); offset *= 2)
-  {
-    if(ctx.thread_index() >= offset)
-    {
-      x = op(first[ctx.thread_index() - offset], x);
-    }
-
-    ctx.barrier();
-
-    first[ctx.thread_index()] = x;
-
-    ctx.barrier();
-  }
-}
-
-
-template<typename Context, typename RandomAccessIterator>
-__device__ __thrust_forceinline__
-void inplace_inclusive_scan(Context &ctx, RandomAccessIterator first)
-{
-  block::inplace_inclusive_scan(ctx, first, thrust::plus<typename thrust::iterator_value<RandomAccessIterator>::type>());
-}
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/merge.h b/thrust/system/cuda/detail/block/merge.h
deleted file mode 100644
index deedcb22f..000000000
--- a/thrust/system/cuda/detail/block/merge.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename StrictWeakOrdering>
-__device__ __thrust_forceinline__
-  RandomAccessIterator3 merge(Context context,
-                              RandomAccessIterator1 first1,
-                              RandomAccessIterator1 last1,
-                              RandomAccessIterator2 first2,
-                              RandomAccessIterator2 last2,
-                              RandomAccessIterator3 result,
-                              StrictWeakOrdering comp);
-
-// XXX assumes that context.block_dimension() <= n1 and
-//                  context.block_dimension() <= n2
-// This algorithm is analogous to inplace_merge
-// but instead of working on the ranges
-// [first, middle) and [middle, last)
-// it works on the ranges
-// [first, first + n1) and [first + n1, first + n1 + n2)
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Size1,
-         typename Size2,
-         typename StrictWeakOrdering>
-__device__ __thrust_forceinline__
-  void inplace_merge_by_key_n(Context context,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator2 values_first,
-                              Size1 n1,
-                              Size2 n2,
-                              StrictWeakOrdering comp);
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/block/merge.inl>
-
diff --git a/thrust/system/cuda/detail/block/merge.inl b/thrust/system/cuda/detail/block/merge.inl
deleted file mode 100644
index bc0e43608..000000000
--- a/thrust/system/cuda/detail/block/merge.inl
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/raw_reference_cast.h>
-#include <thrust/system/detail/generic/scalar/binary_search.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename StrictWeakOrdering>
-__device__ __thrust_forceinline__
-  RandomAccessIterator3 merge(Context context,
-                              RandomAccessIterator1 first1,
-                              RandomAccessIterator1 last1,
-                              RandomAccessIterator2 first2,
-                              RandomAccessIterator2 last2,
-                              RandomAccessIterator3 result,
-                              StrictWeakOrdering comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference1;
-  typedef typename thrust::iterator_difference<RandomAccessIterator2>::type difference2;
-
-  difference1 n1 = last1 - first1;
-  difference2 n2 = last2 - first2;
-
-  // find the rank of each element in the other array
-  difference2 rank2 = 0;
-  if(context.thread_index() < n1)
-  {
-    RandomAccessIterator1 x = first1;
-    x += context.thread_index();
-
-    // lower_bound ensures that x sorts before any equivalent element of input2
-    // this ensures stability
-    rank2 = thrust::system::detail::generic::scalar::lower_bound(first2, last2, raw_reference_cast(*x), comp) - first2;
-  } // end if
-
-  difference1 rank1 = 0;
-  if(context.thread_index() < n2)
-  {
-    RandomAccessIterator2 x = first2 + context.thread_index();
-
-    // upper_bound ensures that x sorts before any equivalent element of input1
-    // this ensures stability
-    rank1 = thrust::system::detail::generic::scalar::upper_bound(first1, last1, raw_reference_cast(*x), comp) - first1;
-  } // end if
-
-  if(context.thread_index() < n1)
-  {
-    // scatter each element from input1
-    RandomAccessIterator1 src = first1 + context.thread_index();
-    RandomAccessIterator3 dst = result + context.thread_index() + rank2;
-
-    *dst = *src;
-  }
-
-  if(context.thread_index() < n2)
-  {
-    // scatter each element from input2
-    RandomAccessIterator2 src = first2 + context.thread_index();
-    RandomAccessIterator3 dst = result + context.thread_index() + rank1;
-
-    *dst = *src;
-  }
-
-  return result + n1 + n2;
-} // end merge
-
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Size1,
-         typename Size2,
-         typename StrictWeakOrdering>
-__device__ __thrust_forceinline__
-  void inplace_merge_by_key_n(Context context,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator2 values_first,
-                              Size1 n1,
-                              Size2 n2,
-                              StrictWeakOrdering comp)
-{
-  RandomAccessIterator1 input1 = keys_first;
-  RandomAccessIterator1 input2 = keys_first + n1;
-
-  RandomAccessIterator2 input1val = values_first;
-  RandomAccessIterator2 input2val = values_first + n1;
-  
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type KeyType;
-  typedef typename thrust::iterator_value<RandomAccessIterator2>::type ValueType;
-
-  // XXX use uninitialized here
-  KeyType inp1 = input1[context.thread_index()]; ValueType inp1val = input1val[context.thread_index()];
-  KeyType inp2 = input2[context.thread_index()]; ValueType inp2val = input2val[context.thread_index()];
-  
-  // to merge input1 and input2, use binary search to find the rank of inp1 & inp2 in arrays input2 & input1, respectively
-  // as before, the "end" variables point to one element after the last element of the arrays
-  
-  // start by looking through input2 for inp1's rank
-  unsigned int start_1 = 0;
-  
-  // don't do the search if our value is beyond the end of input1
-  if(context.thread_index() < n1)
-  {
-    start_1 = thrust::system::detail::generic::scalar::lower_bound_n(input2, n2, inp1, comp) - input2;
-  } // end if
-  
-  // now look through input1 for inp2's rank
-  unsigned int start_2 = 0;
-  
-  // don't do the search if our value is beyond the end of input2
-  if(context.thread_index() < n2)
-  {
-    // upper_bound ensures that equivalent elements in the first range sort before the second
-    start_2 = thrust::system::detail::generic::scalar::upper_bound_n(input1, n1, inp2, comp) - input1;
-  } // end if
-
-  context.barrier();
-  
-  // Write back into the right position to the input arrays; can be done in place since we read in
-  // the input arrays into registers before.
-  if(context.thread_index() < n1)
-  {
-    input1[start_1 + context.thread_index()] = inp1;
-    input1val[start_1 + context.thread_index()] = inp1val;
-  } // end if
-  
-  if(context.thread_index() < n2)
-  {
-    input1[start_2 + context.thread_index()] = inp2;
-    input1val[start_2 + context.thread_index()] = inp2val;
-  } // end if
-} // end inplace_merge_by_key_n()
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/merging_sort.h b/thrust/system/cuda/detail/block/merging_sort.h
deleted file mode 100644
index 5f8eed6a6..000000000
--- a/thrust/system/cuda/detail/block/merging_sort.h
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file merging_sort.h
- *  \brief Block version of merge sort
- */
-
-#pragma once
-
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/system/detail/generic/scalar/binary_search.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-
-template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
-__device__ void conditional_swap(RandomAccessIterator1 keys_first,
-                                 RandomAccessIterator2 values_first,
-                                 const unsigned int i,
-                                 const unsigned int end,
-                                 bool pred,
-                                 Compare comp)
-{
-  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
-  typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;
-
-  if(pred && i+1<end)
-  {
-    KeyType xi = keys_first[i];
-    KeyType xj = keys_first[i+1];
-
-    // swap if xj sorts before xi
-    if(comp(xj, xi))
-    {
-      // XXX this implementation should really dispatch swap via ADL
-      ValueType yi;
-      yi = values_first[i];
-      ValueType yj;
-      yj = values_first[i+1];
-
-      keys_first[i]     = xj;
-      keys_first[i+1]   = xi;
-      values_first[i]   = yj;
-      values_first[i+1] = yi;
-    }
-  }
-}
-
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__device__ void transposition_sort(Context context,
-                                   RandomAccessIterator1 keys_first,
-                                   RandomAccessIterator2 values_first,
-                                   const unsigned int i,
-                                   const unsigned int end,
-                                   const unsigned int size,
-                                   Compare comp)
-{
-  const bool is_odd = i&0x1;
-  
-  for(unsigned int round=size/2; round>0; --round)
-  {
-    // ODDS
-    conditional_swap(keys_first, values_first, i, end, is_odd, comp);
-    context.barrier();
-  
-    // EVENS
-    conditional_swap(keys_first, values_first, i, end, !is_odd, comp);
-    context.barrier();
-  }
-}
-
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__device__ void merge(Context context,
-                      RandomAccessIterator1 keys_first, 
-                      RandomAccessIterator2 values_first,
-                      const unsigned int i,
-                      const unsigned int n,
-                      unsigned int begin,
-                      unsigned int end,
-                      unsigned int h,
-                      StrictWeakOrdering cmp)
-{
-  // INVARIANT: Every element i resides within a sequence [begin,end)
-  //            of length h which is already sorted
-  while( h<n )
-  {
-    h *= 2;
-
-    unsigned int new_begin = i&(~(h-1));
-    unsigned int new_end   = min(n,new_begin+h);
-
-    typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
-    typedef typename thrust::iterator_traits<RandomAccessIterator2>::value_type ValueType;
-
-    KeyType key;
-    ValueType value;
-
-    unsigned int rank = i - begin;
-
-    // prevent out-of-bounds access
-    if(i < new_end)
-    {
-      key = keys_first[i];
-
-      if(begin==new_begin)  // in the left side of merging pair
-      {
-        RandomAccessIterator1 result = thrust::system::detail::generic::scalar::lower_bound_n(keys_first+end, new_end-end, key, cmp);
-        rank += (result - (keys_first+end));
-      }
-      else                  // in the right side of merging pair
-      {
-        RandomAccessIterator1 result = thrust::system::detail::generic::scalar::upper_bound_n(keys_first+new_begin, begin-new_begin, key, cmp);
-        rank += (result - (keys_first+new_begin));
-      }
-
-      value = values_first[i];
-    }
-
-    context.barrier();
-
-    if(i < new_end)
-    {
-      keys_first[new_begin+rank] = key;
-      values_first[new_begin+rank] = value;
-    }
-    
-    context.barrier();
-
-    begin = new_begin;
-    end   = new_end;
-  }
-}
-
-
-/*! Block-wise implementation of merge sort.
- *  It provides the same external interface as odd_even_sort.
- */
-template<typename Context,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__device__ void merging_sort(Context context,
-                             RandomAccessIterator1 keys_first,
-                             RandomAccessIterator2 values_first,
-                             const unsigned int n,
-                             StrictWeakOrdering comp)
-{
-  // Phase 1: Sort subsequences of length 32 using odd-even
-  //          transposition sort.  The code below assumes that h is a
-  //          power of 2.  Empirically, 32 delivers best results,
-  //          which is not surprising since that's the warp width.
-  unsigned int i = context.thread_index();
-  unsigned int h = 32;
-  unsigned int begin=i&(~(h-1)),  end=min(n,begin+h);
-  
-  transposition_sort(context, keys_first, values_first, i, end, h, comp);
-  
-  // Phase 2: Apply merge tree to produce final sorted results
-  merge(context, keys_first, values_first, i, n, begin, end, h, comp);
-} // end merging_sort()
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/odd_even_sort.h b/thrust/system/cuda/detail/block/odd_even_sort.h
deleted file mode 100644
index d32c0f36a..000000000
--- a/thrust/system/cuda/detail/block/odd_even_sort.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file odd_even_sort.h
- *  \brief Block versions of Batcher's Odd-Even Merge Sort
- */
-
-#pragma once
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-
-/*! Block-wise implementation of Batcher's Odd-Even Merge Sort
- *  This implementation is based on Nadathur Satish's.
- */
-template<typename KeyType,
-         typename ValueType,
-         typename StrictWeakOrdering>
-  __device__ void odd_even_sort(KeyType *keys,
-                                ValueType *data,
-                                const unsigned int n,
-                                StrictWeakOrdering comp)
-{
-  for(unsigned int p = blockDim.x>>1; p > 0; p >>= 1)
-  {
-    unsigned int q = blockDim.x>>1, r = 0, d = p;
-
-    while(q >= p)
-    {
-      unsigned int j = threadIdx.x + d;
-
-      // if j lies beyond the end of the array, we consider it "sorted" wrt i
-      // regardless of whether i lies beyond the end of the array 
-      if(threadIdx.x < (blockDim.x-d) && (threadIdx.x & p) == r && j < n)
-      {
-        KeyType xikey = keys[threadIdx.x];
-        KeyType xjkey = keys[j];
-
-        ValueType xivalue = data[threadIdx.x];
-        ValueType xjvalue = data[j];
-
-        // does xj sort before xi?
-        if(comp(xjkey, xikey))
-        {
-          keys[threadIdx.x] = xjkey;
-          keys[j] = xikey;
-
-          data[threadIdx.x] = xjvalue;
-          data[j] = xivalue;
-        } // end if
-      } // end if
-
-      d = q - p;
-      q >>= 1;
-      r = p;
-
-      __syncthreads();
-    } // end while
-  } // end for p
-} // end odd_even_sort()
-
-template<typename KeyType,
-         typename ValueType,
-         typename StrictWeakOrdering>
-  __device__ void stable_odd_even_sort(KeyType *keys,
-                                       ValueType *data,
-                                       const unsigned int n,
-                                       StrictWeakOrdering comp)
-{
-  for(unsigned int i = 0;
-      i < blockDim.x>>1;
-      ++i)
-  {
-    bool thread_is_odd = threadIdx.x & 0x1;
-
-    // do odds first
-    if(thread_is_odd && threadIdx.x + 1 < n)
-    {
-      KeyType xikey = keys[threadIdx.x];
-      KeyType xjkey = keys[threadIdx.x + 1];
-
-      ValueType xivalue = data[threadIdx.x];
-      ValueType xjvalue = data[threadIdx.x + 1];
-
-      // does xj sort before xi?
-      if(comp(xjkey, xikey))
-      {
-        keys[threadIdx.x] = xjkey;
-        keys[threadIdx.x + 1] = xikey;
-
-        data[threadIdx.x] = xjvalue;
-        data[threadIdx.x + 1] = xivalue;
-      } // end if
-    } // end if
-
-    __syncthreads();
-
-    // do evens second
-    if(!thread_is_odd && threadIdx.x + 1 < n)
-    {
-      KeyType xikey = keys[threadIdx.x];
-      KeyType xjkey = keys[threadIdx.x + 1];
-
-      ValueType xivalue = data[threadIdx.x];
-      ValueType xjvalue = data[threadIdx.x + 1];
-
-      // does xj sort before xi?
-      if(comp(xjkey, xikey))
-      {
-        keys[threadIdx.x] = xjkey;
-        keys[threadIdx.x + 1] = xikey;
-
-        data[threadIdx.x] = xjvalue;
-        data[threadIdx.x + 1] = xivalue;
-      } // end if
-    } // end if
-
-    __syncthreads();
-  } // end for i
-} // end stable_odd_even_sort()
-
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/block/reduce.h b/thrust/system/cuda/detail/block/reduce.h
deleted file mode 100644
index 654779336..000000000
--- a/thrust/system/cuda/detail/block/reduce.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/iterator/iterator_traits.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace block
-{
-
-/* Reduces [data, data + n) using binary_op and stores the result in data[0]
- *
- * Upon return the elements in [data + 1, data + n) have unspecified values.
- */
-template <typename Context, typename ValueIterator, typename BinaryFunction>
-__device__ __thrust_forceinline__
-void reduce_n(Context context, ValueIterator data, unsigned int n, BinaryFunction binary_op)
-{
-  if (context.block_dimension() < n)
-  {
-    for (unsigned int i = context.block_dimension() + context.thread_index(); i < n; i += context.block_dimension())
-      data[context.thread_index()] = binary_op(data[context.thread_index()], data[i]);
-
-    context.barrier();
-  }
-
-  while (n > 1)
-  {
-    unsigned int half = n / 2;
-
-    if (context.thread_index() < half)
-      data[context.thread_index()] = binary_op(data[context.thread_index()], data[n - context.thread_index() - 1]);
-
-    context.barrier();
-
-    n = n - half;
-  }
-}
-
-} // end namespace block
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/bulk.h b/thrust/system/cuda/detail/bulk.h
deleted file mode 100644
index cfbbcf033..000000000
--- a/thrust/system/cuda/detail/bulk.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-// we need to carefully undefine and then redefined these macros to ensure that multiple
-// versions of bulk can coexist in the same program
-// push_macro & pop_macro were introduced to gcc in version 4.3
-
-// if the macros are already defined, save them and undefine them
-
-#if !defined(__GNUC__) || (THRUST_GCC_VERSION >= 40300)
-#  ifdef BULK_NAMESPACE_PREFIX
-#    pragma push_macro("BULK_NAMESPACE_PREFIX")
-#    undef BULK_NAMESPACE_PREFIX
-#    define BULK_NAMESPACE_PREFIX_NEEDS_RESTORE
-#  endif
-#  ifdef BULK_NAMESPACE_SUFFIX
-#    pragma push_macro("BULK_NAMESPACE_SUFFIX")
-#    undef BULK_NAMESPACE_SUFFIX
-#    define BULK_NAMESPACE_SUFFIX_NEEDS_RESTORE
-#  endif
-#endif // __GNUC__
-
-// define the macros while we #include our version of bulk
-#define BULK_NAMESPACE_PREFIX namespace thrust { namespace system { namespace cuda { namespace detail {
-#define BULK_NAMESPACE_SUFFIX                  }                  }                }                  }
-
-// rename "bulk" so it doesn't collide with another installation elsewhere
-#define bulk bulk_
-
-#include <thrust/system/cuda/detail/bulk/bulk.hpp>
-
-// undef the top-level namespace name
-#undef bulk
-
-// undef the macros
-#undef BULK_NAMESPACE_PREFIX
-#undef BULK_NAMESPACE_SUFFIX
-
-// redefine the macros if they were defined previously
-
-#if !defined(__GNUC__) || (THRUST_GCC_VERSION >= 40300)
-#  ifdef BULK_NAMESPACE_PREFIX_NEEDS_RESTORE
-#    pragma pop_macro("BULK_NAMESPACE_PREFIX")
-#    undef BULK_NAMESPACE_PREFIX_NEEDS_RESTORE
-#  endif
-#  ifdef BULK_NAMESPACE_SUFFIX_NEEDS_RESTORE
-#    pragma pop_macro("BULK_NAMESPACE_SUFFIX")
-#    undef BULK_NAMESPACE_SUFFIX_NEEDS_RESTORE
-#  endif
-#endif // __GNUC__
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm.hpp b/thrust/system/cuda/detail/bulk/algorithm.hpp
deleted file mode 100644
index d69abc990..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm.hpp
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp> 
-#include <thrust/system/cuda/detail/bulk/algorithm/reduce.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/scan.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/merge.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/scatter.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/sort.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/gather.hpp>
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp b/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp
deleted file mode 100644
index 817ec0e1e..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/reduce.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-#include <thrust/detail/type_traits/function_traits.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator,
-         typename T,
-         typename BinaryFunction>
-__forceinline__ __device__
-T accumulate(const bounded<bound,bulk::agent<grainsize> > &exec,
-             RandomAccessIterator first,
-             RandomAccessIterator last,
-             T init,
-             BinaryFunction binary_op)
-{
-  typedef typename bounded<bound,bulk::agent<grainsize> >::size_type size_type;
-
-  size_type n = last - first;
-
-  for(size_type i = 0; i < exec.bound(); ++i)
-  {
-    if(i < n)
-    {
-      init = binary_op(init, first[i]);
-    } // end if
-  } // end for i
-
-  return init;
-} // end accumulate()
-
-
-namespace detail
-{
-namespace accumulate_detail
-{
-
-
-// XXX this implementation is simply an inplace inclusive scan
-//     we could potentially do better with an implementation which uses Sean's bitfield reverse trick
-template<typename ConcurrentGroup, typename RandomAccessIterator, typename Size, typename T, typename BinaryFunction>
-__device__ T destructive_accumulate_n(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op)
-{
-  typedef typename ConcurrentGroup::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-
-  T x = init;
-  if(tid < n)
-  {
-    x = first[tid];
-  }
-
-  g.wait();
-
-  for(size_type offset = 1; offset < g.size(); offset += offset)
-  {
-    if(tid >= offset && tid - offset < n)
-    {
-      x = binary_op(first[tid - offset], x);
-    }
-
-    g.wait();
-
-    if(tid < n)
-    {
-      first[tid] = x;
-    }
-
-    g.wait();
-  }
-
-  T result = binary_op(init, first[n - 1]);
-
-  g.wait();
-
-  return result;
-}
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator, typename T>
-struct buffer
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-
-  union
-  {
-    uninitialized_array<value_type, groupsize * grainsize> inputs;
-    uninitialized_array<T, groupsize>                      sums;
-  }; // end union
-}; // end buffer
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator, typename T, typename BinaryFunction>
-__device__
-T accumulate(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-             RandomAccessIterator first,
-             RandomAccessIterator last,
-             T init,
-             BinaryFunction binary_op)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  const size_type elements_per_group = groupsize * grainsize;
-
-  size_type tid = g.this_exec.index();
-
-  T sum = init;
-
-  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;
-
-  typedef detail::accumulate_detail::buffer<
-    groupsize,
-    grainsize,
-    RandomAccessIterator,
-    T
-  > buffer_type;
-
-#if __CUDA_ARCH__ >= 200
-  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
-#else
-  __shared__ uninitialized<buffer_type> buffer_impl;
-  buffer_type *buffer = &buffer_impl.get();
-#endif
-  
-  for(; first < last; first += elements_per_group)
-  {
-    // XXX each iteration is essentially a bounded accumulate
-    
-    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
-    
-    // copy partition into smem
-    bulk::copy_n(g, first, partition_size, buffer->inputs.data());
-    
-    T this_sum;
-    size_type local_offset = grainsize * g.this_exec.index();
-
-    size_type local_size = thrust::max<size_type>(0,thrust::min<size_type>(grainsize, partition_size - grainsize * tid));
-
-    if(local_size)
-    {
-      this_sum = buffer->inputs[local_offset];
-      this_sum = bulk::accumulate(bound<grainsize-1>(g.this_exec),
-                                  buffer->inputs.data() + local_offset + 1,
-                                  buffer->inputs.data() + local_offset + local_size,
-                                  this_sum,
-                                  binary_op);
-    } // end if
-
-    g.wait();
-
-    if(local_size)
-    {
-      buffer->sums[tid] = this_sum;
-    } // end if
-
-    g.wait();
-    
-    // sum over the group
-    sum = accumulate_detail::destructive_accumulate_n(g, buffer->sums.data(), thrust::min<size_type>(groupsize,n), sum, binary_op);
-  } // end for
-
-#if __CUDA_ARCH__ >= 200
-  bulk::free(g, buffer);
-#endif
-
-  return sum;
-} // end accumulate
-} // end accumulate_detail
-} // end detail
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator, typename T, typename BinaryFunction>
-__device__
-T accumulate(bulk::concurrent_group<bulk::agent<grainsize>, groupsize> &g,
-             RandomAccessIterator first,
-             RandomAccessIterator last,
-             T init,
-             BinaryFunction binary_op)
-{
-  // use reduce when the operator is commutative
-  if(thrust::detail::is_commutative<BinaryFunction>::value)
-  {
-    init = bulk::reduce(g, first, last, init, binary_op);
-  } // end if
-  else
-  {
-    init = detail::accumulate_detail::accumulate(g, first, last, init, binary_op);
-  } // end else
-
-  return init;
-} // end accumulate()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp b/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp
deleted file mode 100644
index ced30b958..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/adjacent_difference.hpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryOperation>
-__device__
-RandomAccessIterator2 adjacent_difference(bulk::agent<grainsize> &exec,
-                                          RandomAccessIterator1 first, RandomAccessIterator1 last,
-                                          RandomAccessIterator2 result,
-                                          T init,
-                                          BinaryOperation binary_op)
-{
-  for(; first != last; ++first, ++result)
-  {
-    T temp = *first;
-    *result = binary_op(temp, init);
-    init = temp;
-  } // end result
-
-  return result;
-} // end adjacent_difference()
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize_,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryOperation>
-__device__
-RandomAccessIterator2 adjacent_difference(bulk::concurrent_group<bulk::agent<grainsize_>,groupsize> &g,
-                                          RandomAccessIterator1 first, RandomAccessIterator1 last,
-                                          RandomAccessIterator2 result,
-                                          T init,
-                                          BinaryOperation binary_op)
-{
-  // XXX this implementation allows first to be equal to result
-  //     when the input and output do not overlap, we can avoid the need for next_init
-  //     and the barriers
-  
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize_>,groupsize>::size_type size_type;
-
-  RandomAccessIterator2 return_me = result + (last - first);
-
-  const size_type grainsize = g.this_exec.grainsize();
-  const size_type tile_size = g.size() * grainsize;
-
-  // set the first iteration's init
-  RandomAccessIterator1 first_init = first + grainsize * g.this_exec.index() - 1;
-  if(first <= first_init && first_init < last)
-  {
-    init = *first_init;
-  }
-  
-  g.wait();
-
-  for(; first < last; first += tile_size, result += tile_size)
-  {
-    size_type local_offset = grainsize * g.this_exec.index();
-    size_type local_size = thrust::max(0, thrust::min<size_type>(grainsize, last - (first + local_offset)));
-
-    // get the init for the next iteration
-    T next_init = (first + local_offset + tile_size - 1 < last) ? first[tile_size-1] : init;
-
-    g.wait();
-
-    // consume grainsize elements
-    bulk::adjacent_difference(g.this_exec,
-                              first + local_offset,
-                              first + local_offset + local_size,
-                              result + local_offset,
-                              init,
-                              binary_op);
-
-    init = next_init;
-  }
-
-  g.wait();
-
-  return return_me;
-} // end adjacent_difference()
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename BinaryOperation>
-__device__
-RandomAccessIterator2 adjacent_difference(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-                                          RandomAccessIterator1 first, RandomAccessIterator1 last,
-                                          RandomAccessIterator2 result,
-                                          BinaryOperation binary_op)
-{
-  if(first < last)
-  {
-    typename thrust::iterator_value<RandomAccessIterator1>::type init = *first;
-
-    // we need to wait because first may be the same as result
-    g.wait();
-
-    if(g.this_exec.index() == 0)
-    {
-      *result = init;
-    }
-
-    result = bulk::adjacent_difference(g, first + 1, last, result + 1, init, binary_op); 
-  } // end if
-
-  return result;
-} // end adjacent_difference()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/copy.hpp b/thrust/system/cuda/detail/bulk/algorithm/copy.hpp
deleted file mode 100644
index 4c24f801c..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/copy.hpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/is_contiguous_iterator.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp>
-#include <thrust/detail/type_traits.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__forceinline__ __device__
-RandomAccessIterator2 copy_n(const bounded<bound,agent<grainsize> > &b,
-                             RandomAccessIterator1 first,
-                             Size n,
-                             RandomAccessIterator2 result)
-{
-  typedef typename bounded<bound,agent<grainsize> >::size_type size_type;
-
-  if(bound <= n)
-  {
-    for(size_type i = 0; i < b.bound(); ++i, ++result, ++first)
-    {
-      *result = *first;
-    } // end for i
-  } // end if
-  else
-  {
-    for(size_type i = 0; i < b.bound(); ++i, ++first)
-    {
-      if(i < n)
-      {
-        *result = *first;
-        ++result;
-      } // end if
-    } // end for i
-  } // end else
-
-  return result;
-} // end copy_n()
-
-
-
-namespace detail
-{
-
-
-template<typename ConcurrentGroup,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__forceinline__ __device__
-RandomAccessIterator2 simple_copy_n(ConcurrentGroup &g, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  for(Size i = g.this_exec.index();
-      i < n;
-      i += g.size())
-  {
-    result[i] = first[i];
-  } // end for i
-
-  g.wait();
-
-  return result + n;
-} // end simple_copy_n()
-
-
-template<std::size_t size,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__forceinline__ __device__
-typename thrust::detail::enable_if<
-  (size * grainsize > 0),
-  RandomAccessIterator2
->::type
-  simple_copy_n(bulk::concurrent_group<
-                  agent<grainsize>,
-                  size
-                > &g,
-                RandomAccessIterator1 first, Size n,
-                RandomAccessIterator2 result)
-{
-  typedef bulk::concurrent_group<
-    agent<grainsize>,
-    size
-  > group_type;
-
-  RandomAccessIterator2 return_me = result + n;
-
-  typedef typename group_type::size_type size_type;
-  size_type chunk_size = size * grainsize;
-
-  size_type tid = g.this_exec.index();
-
-  // important special case which avoids the expensive for loop below
-  if(chunk_size == n)
-  {
-    // offset iterators by tid before loop
-    first += tid;
-    result += tid;
-
-    for(size_type i = 0; i < grainsize; ++i, first += size, result += size)
-    {
-      *result = *first;
-    } // end for
-  } // end if
-  else
-  {
-    // XXX i have a feeling the indexing could be rewritten to require less arithmetic
-    for(RandomAccessIterator1 last = first + n;
-        first < last;
-        first += chunk_size, result += chunk_size)
-    {
-      // avoid conditional accesses when possible
-      if((last - first) >= chunk_size)
-      {
-        for(size_type i = 0; i < grainsize; ++i)
-        {
-          size_type idx = size * i + tid;
-          result[idx] = first[idx];
-        } // end for
-      } // end if
-      else
-      {
-        for(size_type i = 0; i < grainsize; ++i)
-        {
-          size_type idx = size * i + tid;
-          if(idx < (last - first))
-          {
-            result[idx] = first[idx];
-          } // end if
-        } // end for
-      } // end else
-    } // end for
-  } // end else
-
-  g.wait();
-
-  return return_me;
-} // end simple_copy_n()
-
-
-template<std::size_t size,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__forceinline__ __device__
-RandomAccessIterator2 copy_n(concurrent_group<
-                               agent<grainsize>,
-                               size
-                             > &g,
-                             RandomAccessIterator1 first,
-                             Size n,
-                             RandomAccessIterator2 result)
-{
-  return detail::simple_copy_n(g, first, n, result);
-} // end copy_n()
-
-
-} // end detail
-
-
-template<std::size_t groupsize,
-         typename Executor,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__forceinline__ __device__
-RandomAccessIterator2
-  copy_n(bulk::concurrent_group<Executor,groupsize> &g, RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  return detail::copy_n(g, first, n, result);
-} // end copy_n()
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-__device__
-typename thrust::detail::enable_if<
-  (bound <= groupsize * grainsize),
-  RandomAccessIterator2 
->::type
-copy_n(bulk::bounded<
-         bound,
-         concurrent_group<
-           agent<grainsize>,
-           groupsize
-         >
-       > &g,
-       RandomAccessIterator1 first,
-       Size n,
-       RandomAccessIterator2 result)
-{
-  typedef bounded<
-    bound,
-    concurrent_group<
-      agent<grainsize>,
-      groupsize
-    >
-  > group_type;
-
-  typedef typename group_type::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-  // XXX make this an uninitialized array
-  value_type stage[grainsize];
-
-  // avoid conditional accesses when possible
-  if(groupsize * grainsize <= n)
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type src_idx = g.size() * i + tid;
-      stage[i] = first[src_idx];
-    } // end for i
-
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type dst_idx = g.size() * i + tid;
-      result[dst_idx] = stage[i];
-    } // end for i
-  } // end if
-  else
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type src_idx = g.size() * i + tid;
-      if(src_idx < n)
-      {
-        stage[i] = first[src_idx];
-      } // end if
-    } // end for
-
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type dst_idx = g.size() * i + tid;
-      if(dst_idx < n)
-      {
-        result[dst_idx] = stage[i];
-      } // end if
-    } // end for
-  } // end else
-
-  g.wait();
-
-  return result + thrust::min<Size>(g.size() * grainsize, n);
-} // end copy_n()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp b/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp
deleted file mode 100644
index 8ca22bf1b..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/gather.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/merge.hpp>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/iterator/counting_iterator.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-// XXX forward declaration for inplace_merge_adjacent_partitions below
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__forceinline__ __device__
-void stable_sort_by_key(const bounded<bound,agent<grainsize> > &exec,
-                        RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
-                        RandomAccessIterator2 values_first,
-                        Compare comp);
-
-
-namespace detail
-{
-namespace stable_merge_sort_detail
-{
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize, typename KeyType, typename ValType, typename Compare>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize
->::type
-inplace_merge_adjacent_partitions(bulk::bounded<bound,bulk::concurrent_group<bulk::agent<grainsize>, groupsize> > &g,
-                                  KeyType local_keys[grainsize], ValType local_values[grainsize], void* stage_ptr, int count, int local_size, Compare comp)
-{
-  union stage_t
-  {
-    KeyType *keys;
-    ValType *vals;
-  };
-  
-  stage_t stage;
-  stage.keys = reinterpret_cast<KeyType*>(stage_ptr);
-
-  typedef typename bulk::agent<grainsize>::size_type size_type;
-
-  size_type local_offset = grainsize * g.this_exec.index();
-
-  // XXX this loop seems to assume that groupsize is a power of two
-  //     NPOT groupsize crashes merge sort
-  for(size_type num_agents_per_merge = 2; num_agents_per_merge <= groupsize; num_agents_per_merge *= 2)
-  {
-    // copy keys into the stage so we can dynamically index them
-    bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_keys, local_size, stage.keys + local_offset);
-
-    g.wait();
-
-    // find the index of the first array this agent will merge
-    size_type list = ~(num_agents_per_merge - 1) & g.this_exec.index();
-    size_type diag = thrust::min<size_type>(count, grainsize * ((num_agents_per_merge - 1) & g.this_exec.index()));
-    size_type start = grainsize * list;
-
-    // the size of each of the two input arrays we're merging
-    size_type input_size = grainsize * (num_agents_per_merge / 2);
-
-    size_type partition_first1 = thrust::min<size_type>(count, start);
-    size_type partition_first2 = thrust::min<size_type>(count, partition_first1 + input_size);
-    size_type partition_last2  = thrust::min<size_type>(count, partition_first2 + input_size);
-
-    size_type n1 = partition_first2 - partition_first1;
-    size_type n2 = partition_last2  - partition_first2;
-
-    size_type mp = bulk::merge_path(stage.keys + partition_first1, n1, stage.keys + partition_first2, n2, diag, comp);
-
-    // each agent merges sequentially locally
-    // note the source index of each merged value so that we can gather values into merged order later
-    size_type gather_indices[grainsize];
-    bulk::merge_by_key(bulk::bound<grainsize>(g.this_exec),
-                       stage.keys + partition_first1 + mp,        stage.keys + partition_first2,
-                       stage.keys + partition_first2 + diag - mp, stage.keys + partition_last2,
-                       thrust::make_counting_iterator<size_type>(partition_first1 + mp),
-                       thrust::make_counting_iterator<size_type>(partition_first2 + diag - mp),
-                       local_keys,
-                       gather_indices,
-                       comp);
-    
-    // move values into the stage so we can index them
-    bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_values, local_size, stage.vals + local_offset);
-
-    // gather values into registers
-    bulk::gather(bulk::bound<grainsize>(g.this_exec), gather_indices, gather_indices + local_size, stage.vals, local_values);
-
-    g.wait();
-  } // end for
-} // end inplace_merge_adjacent_partitions()
-
-
-} // end stable_merge_sort_detail
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize
->::type
-stable_merge_sort_by_key(bulk::bounded<bound,bulk::concurrent_group<bulk::agent<grainsize>,groupsize> > &g,
-                         RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
-                         RandomAccessIterator2 values_first,
-                         Compare comp)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type key_type;
-  typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type;
-
-  typedef typename bulk::agent<grainsize>::size_type size_type;
-
-  size_type n = keys_last - keys_first;
-  const size_type tile_size = groupsize * grainsize;
-
-  size_type local_offset = grainsize * g.this_exec.index();
-  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n - local_offset));
-
-#if __CUDA_ARCH__ >= 200
-  union
-  {
-    key_type   *keys;
-    value_type *values;
-  } stage;
-
-  stage.keys = static_cast<key_type*>(bulk::malloc(g, tile_size * thrust::max(sizeof(key_type), sizeof(value_type))));
-#else
-  __shared__ union
-  {
-    key_type   keys[tile_size];
-    value_type values[tile_size];
-  } stage;
-#endif
-  
-  // load each agent's keys into registers
-  bulk::copy_n(bulk::bound<tile_size>(g), keys_first, n, stage.keys);
-
-  key_type local_keys[grainsize];
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), stage.keys + local_offset, local_size, local_keys);
-
-  // load each agent's values into registers
-  bulk::copy_n(bulk::bound<tile_size>(g), values_first, n, stage.values);
-
-  value_type local_values[grainsize];
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), stage.values + local_offset, local_size, local_values);
-
-  // each agent sorts its local partition of the array
-  bulk::stable_sort_by_key(bulk::bound<grainsize>(g.this_exec), local_keys, local_keys + local_size, local_values, comp);
-  
-  // merge adjacent partitions together
-  // avoid dynamic sizes when possible
-  if(n == tile_size)
-  {
-    stable_merge_sort_detail::inplace_merge_adjacent_partitions(g, local_keys, local_values, stage.keys, tile_size, grainsize, comp);
-  } // end if
-  else
-  {
-    stable_merge_sort_detail::inplace_merge_adjacent_partitions(g, local_keys, local_values, stage.keys, n, local_size, comp);
-  } // end else
-
-  // store the sorted keys back to the input
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_keys, local_size, stage.keys + local_offset);
-  g.wait();
-
-  bulk::copy_n(bulk::bound<tile_size>(g), stage.keys, n, keys_first);
-  
-  // store the sorted values back to the input
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_values, local_size, stage.values + local_offset);
-  g.wait();
-
-  bulk::copy_n(bulk::bound<tile_size>(g), stage.values, n, values_first);
-
-#if __CUDA_ARCH__ >= 200
-  bulk::free(g, stage.keys);
-#endif
-} // end stable_merge_sort_by_key()
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp b/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp
deleted file mode 100644
index 9758054ec..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/for_each.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename ExecutionGroup,
-         typename RandomAccessIterator,
-         typename Size,
-         typename Function>
-__device__
-RandomAccessIterator for_each_n(ExecutionGroup &g, RandomAccessIterator first, Size n, Function f)
-{
-  for(Size i = g.this_thread.index();
-      i < n;
-      i += g.size())
-  {
-    f(first[i]);
-  } // end for i
-
-  g.wait();
-
-  return first + n;
-} // end for_each()
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator,
-         typename Size,
-         typename Function>
-__device__
-RandomAccessIterator for_each_n(bounded<bound, bulk::agent<grainsize> > &b,
-                                RandomAccessIterator first,
-                                Size n,
-                                Function f)
-{
-  typedef typename bounded<bound, bulk::agent<grainsize> >::size_type size_type;
-
-  for(size_type i = 0; i < bound; ++i)
-  {
-    if(i < n)
-    {
-      f(first[i]);
-    } // end if
-  } // end for i
-
-  return first + n;
-} // end for_each_n()
-                                
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/gather.hpp b/thrust/system/cuda/detail/bulk/algorithm/gather.hpp
deleted file mode 100644
index 598dd9d2a..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/gather.hpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/iterator/permutation_iterator.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-// XXX eliminate me!
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3>
-__forceinline__ __device__
-RandomAccessIterator3 gather(const bounded<bound,agent<grainsize> > &,
-                             RandomAccessIterator1 map_first,
-                             RandomAccessIterator1 map_last,
-                             RandomAccessIterator2 input_first,
-                             RandomAccessIterator3 result)
-{
-  typedef typename bulk::bounded<bound,bulk::agent<grainsize> >::size_type size_type;
-
-  size_type n = map_last - map_first;
-
-  if(bound <= n)
-  {
-    for(size_type i = 0; i < bound; ++i)
-    {
-      result[i] = input_first[map_first[i]];
-    }
-  }
-  else
-  {
-    for(size_type i = 0; i < bound; ++i)
-    {
-      if(i < n)
-      {
-        result[i] = input_first[map_first[i]];
-      }
-    }
-  }
-
-  return result + n;
-} // end scatter_if()
-
-
-template<typename ExecutionGroup, typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3>
-__forceinline__ __device__
-RandomAccessIterator3 gather(ExecutionGroup &g,
-                             RandomAccessIterator1 map_first,
-                             RandomAccessIterator1 map_last,
-                             RandomAccessIterator2 input_first,
-                             RandomAccessIterator3 result)
-{
-  return bulk::copy_n(g,
-                      thrust::make_permutation_iterator(input_first, map_first),
-                      map_last - map_first,
-                      result);
-} // end gather()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/merge.hpp b/thrust/system/cuda/detail/bulk/algorithm/merge.hpp
deleted file mode 100644
index 355185e5d..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/merge.hpp
+++ /dev/null
@@ -1,612 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/gather.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/detail/join_iterator.h>
-#include <thrust/detail/minmax.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename Compare>
-__device__
-Size merge_path(RandomAccessIterator1 first1, Size n1,
-                RandomAccessIterator2 first2, Size n2,
-                Size diag,
-                Compare comp)
-{
-  Size begin = thrust::max<Size>(Size(0), diag - n2);
-  Size end = thrust::min<Size>(diag, n1);
-  
-  while(begin < end)
-  {
-    Size mid = (begin + end) >> 1;
-
-    if(comp(first2[diag - 1 - mid], first1[mid]))
-    {
-      end = mid;
-    } // end if
-    else
-    {
-      begin = mid + 1;
-    } // end else
-  } // end while
-
-  return begin;
-} // end merge_path()
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator,
-         typename Compare>
-__device__
-OutputIterator merge(const bulk::bounded<bound,agent<grainsize> > &e,
-                     InputIterator1 first1, InputIterator1 last1,
-                     InputIterator2 first2, InputIterator2 last2,
-                     OutputIterator result,
-                     Compare comp)
-{
-  typedef typename bulk::bounded<bound,bulk::agent<grainsize> >::size_type size_type;
-
-  typedef typename thrust::iterator_value<InputIterator1>::type value_type1;
-  typedef typename thrust::iterator_value<InputIterator2>::type value_type2;
-
-  size_type n = (last1 - first1) + (last2 - first2);
-
-  // XXX uninitialized is a speed-down in this instance
-  //bulk::uninitialized<value_type1>   key_a;
-  value_type1   key_a;
-  size_type     n1 = last1 - first1;
-  size_type     idx1 = 0;
-
-  if(n1 > 0)
-  {
-    //key_a.construct(first1[idx1]);
-    key_a = first1[idx1];
-  } // end if
-
-  //bulk::uninitialized<value_type2>   key_b;
-  value_type2   key_b;
-  size_type     n2 = last2 - first2;
-  size_type     idx2 = 0;
-
-  if(n2 > 0)
-  {
-    //key_b.construct(first2[idx2]);
-    key_b = first2[idx2];
-  } // end if
-  
-  // avoid branching when possible
-  if(bound <= n)
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      bool p = (idx2 >= n2) || ((idx1 < n1) && !comp(key_b, key_a));
-      
-      result[i] = p ? key_a : key_b;
-
-      if(p)
-      {
-        ++idx1;
-        
-        // use of min avoids conditional load
-        key_a = first1[min(idx1, n1 - 1)];
-      } // end if
-      else
-      {
-        ++idx2;
-
-        // use of min avoids conditional load
-        key_b = first2[min(idx2, n2 - 1)];
-      } // end else
-    } // end for
-  } // end if
-  else
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      if(i < n)
-      {
-        bool p = (idx2 >= n2) || ((idx1 < n1) && !comp(key_b, key_a));
-        
-        result[i] = p ? key_a : key_b;
-
-        if(p)
-        {
-          ++idx1;
-
-          // use of min avoids conditional load
-          key_a = first1[min(idx1, n1 - 1)];
-        } // end if
-        else
-        {
-          ++idx2;
-
-          // use of min avoids conditional load
-          key_b = first2[min(idx2, n2 - 1)];
-        } // end else
-      } // end if
-    } // end for
-  } // end else
-
-//  if(n1 > 0)
-//  {
-//    key_a.destroy();
-//  } // end if
-//
-//  if(n2 > 0)
-//  {
-//    key_b.destroy();
-//  } // end if
-
-  return result + n;
-} // end merge
-
-
-template<std::size_t bound, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename RandomAccessIterator5,
-         typename RandomAccessIterator6,
-         typename Compare>
-__device__
-thrust::pair<RandomAccessIterator5,RandomAccessIterator6>
-  merge_by_key(const bulk::bounded<bound,bulk::agent<grainsize> > &,
-               RandomAccessIterator1 keys_first1, RandomAccessIterator1 keys_last1,
-               RandomAccessIterator2 keys_first2, RandomAccessIterator2 keys_last2,
-               RandomAccessIterator3 values_first1,
-               RandomAccessIterator4 values_first2,
-               RandomAccessIterator5 keys_result,
-               RandomAccessIterator6 values_result,
-               Compare comp)
-{
-  typedef typename bulk::bounded<bound,bulk::agent<grainsize> >::size_type size_type;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type key_type1;
-  typedef typename thrust::iterator_value<RandomAccessIterator2>::type key_type2;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type1;
-  typedef typename thrust::iterator_value<RandomAccessIterator4>::type value_type2;
-
-  size_type n = (keys_last1 - keys_first1) + (keys_last2 - keys_first2);
-
-  // XXX uninitialized is a speed-down in this instance
-  //bulk::uninitialized<key_type1>   key_a;
-  //bulk::uninitialized<value_type1> val_a;
-  key_type1   key_a;
-  value_type1 val_a;
-  size_type   n1 = keys_last1 - keys_first1;
-  size_type   idx1 = 0;
-
-  if(n1 > 0)
-  {
-    //key_a.construct(keys_first1[idx1]);
-    //val_a.construct(values_first1[idx1]);
-    key_a = keys_first1[idx1];
-    val_a = values_first1[idx1];
-  } // end if
-
-  //bulk::uninitialized<key_type2>   key_b;
-  //bulk::uninitialized<value_type2> val_b;
-  key_type2   key_b;
-  value_type2 val_b;
-  size_type   n2 = keys_last2 - keys_first2;
-  size_type   idx2 = 0;
-
-  if(n2 > 0)
-  {
-    //key_b.construct(keys_first2[idx2]);
-    //val_b.construct(values_first2[idx2]);
-    key_b = keys_first2[idx2];
-    val_b = values_first2[idx2];
-  } // end if
-  
-  // avoid branching when possible
-  if(bound <= n)
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      bool p = (idx2 >= n2) || ((idx1 < n1) && !comp(key_b, key_a));
-      
-      keys_result[i]   = p ? key_a : key_b;
-      values_result[i] = p ? val_a : val_b;
-
-      if(p)
-      {
-        ++idx1;
-
-        // use of min avoids conditional loads
-        key_a = keys_first1[min(idx1, n1 - 1)];
-        val_a = values_first1[min(idx1, n1 - 1)];
-      } // end if
-      else
-      {
-        ++idx2;
-
-        // use of min avoids conditional loads
-        key_b = keys_first2[min(idx2, n2 - 1)];
-        val_b = values_first2[min(idx2, n2 - 1)];
-      } // end else
-    } // end for
-  } // end if
-  else
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      if(i < n)
-      {
-        bool p = (idx2 >= n2) || ((idx1 < n1) && !comp(key_b, key_a));
-        
-        keys_result[i]   = p ? key_a : key_b;
-        values_result[i] = p ? val_a : val_b;
-
-        if(p)
-        {
-          ++idx1;
-
-          // use of min avoids conditional loads
-          key_a = keys_first1[min(idx1, n1 - 1)];
-          val_a = values_first1[min(idx1, n1 - 1)];
-        } // end if
-        else
-        {
-          ++idx2;
-
-          // use of min avoids conditional loads
-          key_b = keys_first2[min(idx2, n2 - 1)];
-          val_b = values_first2[min(idx2, n2 - 1)];
-        } // end else
-      } // end if
-    } // end for
-  } // end else
-
-//  if(n1 > 0)
-//  {
-//    key_a.destroy();
-//    val_a.destroy();
-//  } // end if
-//
-//  if(n2 > 0)
-//  {
-//    key_b.destroy();
-//    val_b.destroy();
-//  } // end if
-
-  return thrust::make_pair(keys_result + n, values_result + n);
-} // end merge_by_key()
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator, typename Compare>
-__device__
-typename thrust::detail::enable_if<
-  (bound <= groupsize * grainsize)
->::type
-inplace_merge(bulk::bounded<
-                bound,
-                bulk::concurrent_group<
-                  bulk::agent<grainsize>,
-                  groupsize
-                >
-              > &g,
-              RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last,
-              Compare comp)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  size_type n1 = middle - first;
-  size_type n2 = last - middle;
-
-  // find the start of each local merge
-  size_type local_offset = grainsize * g.this_exec.index();
-
-  size_type mp = bulk::merge_path(first, n1, middle, n2, local_offset, comp);
-  
-  // do a local sequential merge
-  size_type local_offset1 = mp;
-  size_type local_offset2 = n1 + local_offset - mp;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-  value_type local_result[grainsize];
-  bulk::merge(bulk::bound<grainsize>(g.this_exec),
-              first + local_offset1, middle,
-              first + local_offset2, last,
-              local_result,
-              comp);
-
-  g.wait();
-
-  // copy local result back to source
-  // this is faster than getting the size from merge's result
-  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, first + local_offset); 
-
-  g.wait();
-} // end inplace_merge()
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename Compare>
-__device__
-typename thrust::detail::enable_if<
-  (bound <= groupsize * grainsize),
-  RandomAccessIterator3
->::type
-merge(bulk::bounded<
-        bound,
-        bulk::concurrent_group<
-          bulk::agent<grainsize>,
-          groupsize
-        >
-      > &g,
-      RandomAccessIterator1 first1, RandomAccessIterator1 last1,
-      RandomAccessIterator2 first2, RandomAccessIterator2 last2,
-      RandomAccessIterator3 result,
-      Compare comp)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  size_type n1 = last1 - first1;
-  size_type n2 = last2 - first2;
-
-  // find the start of each local merge
-  size_type local_offset = grainsize * g.this_exec.index();
-
-  size_type mp = bulk::merge_path(first1, n1, first2, n2, local_offset, comp);
-  
-  // do a local sequential merge
-  size_type local_offset1 = mp;
-  size_type local_offset2 = local_offset - mp;
-  
-  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;
-  value_type local_result[grainsize];
-  bulk::merge(bulk::bound<grainsize>(g.this_exec),
-              first1 + local_offset1, last1,
-              first2 + local_offset2, last2,
-              local_result,
-              comp);
-
-  // store local result
-  // this is faster than getting the size from merge's result
-  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n1 + n2 - local_offset));
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), local_result, local_size, result + local_offset); 
-
-  g.wait();
-
-  return result + thrust::min<size_type>(groupsize * grainsize, n1 + n2);
-} // end merge()
-
-
-namespace detail
-{
-namespace merge_detail
-{
-
-
-// XXX this should take a bounded
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4, typename Compare>
-__device__
-RandomAccessIterator4
-  bounded_merge_with_buffer(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &exec,
-                            RandomAccessIterator1 first1, RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2, RandomAccessIterator2 last2,
-                            RandomAccessIterator3 buffer,
-                            RandomAccessIterator4 result,
-                            Compare comp)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  size_type n1 = last1 - first1;
-  size_type n2 = last2 - first2;
-
-  // copy into the buffer
-  bulk::copy_n(bulk::bound<groupsize * grainsize>(exec),
-               thrust::detail::make_join_iterator(first1, n1, first2),
-               n1 + n2,
-               buffer);
-
-  // inplace merge in the buffer
-  bulk::inplace_merge(bulk::bound<groupsize * grainsize>(exec),
-                      buffer, buffer + n1, buffer + n1 + n2,
-                      comp);
-  
-  // copy to the result
-  // XXX this might be slightly faster with a bounded copy_n
-  return bulk::copy_n(exec, buffer, n1 + n2, result);
-} // end bounded_merge_with_buffer()
-
-
-} // end merge_detail
-} // end detail
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename RandomAccessIterator3, typename Compare>
-__device__
-RandomAccessIterator3 merge(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &exec,
-                            RandomAccessIterator1 first1, RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2, RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator3>::type value_type;
-
-  value_type *buffer = reinterpret_cast<value_type*>(bulk::malloc(exec, exec.size() * exec.grainsize() * sizeof(value_type)));
-
-  size_type chunk_size = exec.size() * exec.this_exec.grainsize();
-
-  size_type n1 = last1 - first1;
-  size_type n2 = last2 - first2;
-
-  // avoid the search & loop when possible
-  if(n1 + n2 <= chunk_size)
-  {
-    result = detail::merge_detail::bounded_merge_with_buffer(exec, first1, last1, first2, last2, buffer, result, comp);
-  } // end if
-  else
-  {
-    while((first1 < last1) || (first2 < last2))
-    {
-      size_type n1 = last1 - first1;
-      size_type n2 = last2 - first2;
-
-      size_type diag = thrust::min<size_type>(chunk_size, n1 + n2);
-
-      size_type mp = bulk::merge_path(first1, n1, first2, n2, diag, comp);
-
-      result = detail::merge_detail::bounded_merge_with_buffer(exec,
-                                                               first1, first1 + mp,
-                                                               first2, first2 + diag - mp,
-                                                               buffer,
-                                                               result,
-                                                               comp);
-
-      first1 += mp;
-      first2 += diag - mp;
-    } // end while
-  } // end else
-
-  bulk::free(exec, buffer);
-
-  return result;
-} // end merge()
-
-
-template<std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4,
-         typename RandomAccessIterator5,
-         typename RandomAccessIterator6,
-         typename Compare>
-__device__
-thrust::pair<RandomAccessIterator5,RandomAccessIterator6>
-merge_by_key(bulk::bounded<
-               groupsize*grainsize,
-               bulk::concurrent_group<bulk::agent<grainsize>, groupsize>
-             > &g,
-             RandomAccessIterator1 keys_first1, RandomAccessIterator1 keys_last1,
-             RandomAccessIterator2 keys_first2, RandomAccessIterator2 keys_last2,
-             RandomAccessIterator3 values_first1,
-             RandomAccessIterator4 values_first2,
-             RandomAccessIterator5 keys_result,
-             RandomAccessIterator6 values_result,
-             Compare comp)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator5>::type key_type;
-
-#if __CUDA_ARCH__ >= 200
-  union
-  {
-    key_type  *keys;
-    size_type *indices;
-  } stage;
-
-  stage.keys = static_cast<key_type*>(bulk::malloc(g, groupsize * grainsize * thrust::max(sizeof(key_type), sizeof(size_type))));
-#else
-  __shared__ union
-  {
-    key_type  keys[groupsize * grainsize];
-    size_type indices[groupsize * grainsize];
-  } stage;
-#endif
-
-  size_type n1 = keys_last1 - keys_first1;
-  size_type n2 = keys_last2 - keys_first2;
-  size_type  n = n1 + n2;
-  
-  // copy keys into stage
-  bulk::copy_n(g,
-               thrust::detail::make_join_iterator(keys_first1, n1, keys_first2),
-               n,
-               stage.keys);
-
-  // find the start of each agent's sequential merge
-  size_type diag = thrust::min<size_type>(n1 + n2, grainsize * g.this_exec.index());
-  size_type mp = bulk::merge_path(stage.keys, n1, stage.keys + n1, n2, diag, comp);
-  
-  // compute the ranges of the sources in the stage.
-  size_type start1 = mp;
-  size_type start2 = n1 + diag - mp;
-
-  size_type end1 = n1;
-  size_type end2 = n1 + n2;
-  
-  // each agent merges sequentially
-  key_type  results[grainsize];
-  size_type indices[grainsize];
-  bulk::merge_by_key(bulk::bound<grainsize>(g.this_exec),
-                     stage.keys + start1, stage.keys + end1,
-                     stage.keys + start2, stage.keys + end2,
-                     thrust::make_counting_iterator<size_type>(start1),
-                     thrust::make_counting_iterator<size_type>(start2),
-                     results,
-                     indices,
-                     comp);
-  g.wait();
-  
-  // each agent stores merged keys back to the stage
-  size_type local_offset = grainsize * g.this_exec.index();
-  size_type local_size = thrust::max<size_type>(0, thrust::min<size_type>(grainsize, n - local_offset));
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), results, local_size, stage.keys + local_offset);
-  g.wait();
-  
-  // store merged keys to the result
-  keys_result = bulk::copy_n(g, stage.keys, n, keys_result);
-  
-  // each agent copies the indices into the stage
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), indices, local_size, stage.indices + local_offset);
-  g.wait();
-  
-  // gather values into merged order
-  values_result = bulk::gather(g,
-                               stage.indices, stage.indices + n,
-                               thrust::detail::make_join_iterator(values_first1, n1, values_first2),
-                               values_result);
-
-#if __CUDA_ARCH__ >= 200
-  bulk::free(g, stage.keys);
-#endif
-
-  return thrust::make_pair(keys_result, values_result);
-} // end merge_by_key()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp b/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp
deleted file mode 100644
index 7f9ccaaa2..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/reduce.hpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-#include <thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/minmax.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator,
-         typename T,
-         typename BinaryFunction>
-__forceinline__ __device__
-T reduce(const bulk::bounded<bound,bulk::agent<grainsize> > &exec,
-         RandomAccessIterator first,
-         RandomAccessIterator last,
-         T init,
-         BinaryFunction binary_op)
-{
-  typedef typename bulk::bounded<bound,bulk::agent<grainsize> >::size_type size_type;
-
-  size_type n = last - first;
-
-  for(size_type i = 0; i < exec.bound(); ++i)
-  {
-    if(i < n)
-    {
-      init = binary_op(init, first[i]);
-    } // end if
-  } // end for i
-
-  return init;
-} // end reduce()
-
-
-namespace detail
-{
-namespace reduce_detail
-{
-
-
-template<typename ConcurrentGroup, typename RandomAccessIterator, typename Size, typename T, typename BinaryFunction>
-__device__ T destructive_reduce_n(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op)
-{
-  typedef int size_type;
-
-  size_type tid = g.this_exec.index();
-
-  Size m = n;
-
-  while(m > 1)
-  {
-    Size half_m = m >> 1;
-
-    if(tid < half_m)
-    {
-      T old_val = first[tid];
-
-      first[tid] = binary_op(old_val, first[m - tid - 1]);
-    } // end if
-
-    g.wait();
-
-    m -= half_m;
-  } // end while
-
-  g.wait();
-
-  T result = init;
-  if(n > 0)
-  {
-    result = binary_op(result,first[0]);
-  } // end if
-
-  g.wait();
-
-  return result;
-} // end destructive_reduce_n()
-
-
-} // end reduce_detail
-} // end detail
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator, typename T, typename BinaryFunction>
-__device__
-T reduce(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-         RandomAccessIterator first,
-         RandomAccessIterator last,
-         T init,
-         BinaryFunction binary_op)
-{
-  typedef int size_type;
-
-  const size_type elements_per_group = groupsize * grainsize;
-
-  size_type tid = g.this_exec.index();
-
-  T this_sum;
-
-  bool this_sum_defined = false;
-
-  size_type n = last - first;
-
-  // XXX we use offset as the loop counter variable instead of first
-  //     because elements_per_group can actually overflow some kinds of iterators
-  //     with small difference_types
-  for(size_type offset = 0; offset < n; first += elements_per_group, offset += elements_per_group)
-  {
-    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
-
-    typedef typename thrust::iterator_value<RandomAccessIterator>::type input_type;
-    
-    // load input into register
-    input_type local_inputs[grainsize];
-
-    // each agent strides through the input range
-    // and copies into a local array
-    strided_iterator<RandomAccessIterator,size_type> local_first = make_strided_iterator(first + tid, static_cast<size_type>(groupsize));
-
-    // XXX if we could precompute local_size for the else branch,
-    //     we could just call copy_n here
-    //     we can't precompute it (without a divide afaik), so we compute local_size in the else branch
-    size_type local_size = 0;
-    if(partition_size < elements_per_group)
-    {
-//  XXX i guess nvcc miscompiles this loop for counting_iterators
-//      size_type index = tid;
-//      for(size_type i = 0; i < grainsize; ++i, ++local_first, index += groupsize)
-//      {
-//        if(index < partition_size)
-//        {
-//          local_inputs[i] = *local_first;
-//          ++local_size;
-//        } // end if
-//      } // end for
-//
-      RandomAccessIterator iter = local_first.base();
-      size_type index = tid;
-      for(size_type i = 0; i < grainsize; ++i, index += groupsize, iter += groupsize)
-      {
-        if(index < partition_size)
-        {
-          local_inputs[i] = *iter;
-          ++local_size;
-        } // end if
-      } // end for
-    } // end if
-    else
-    {
-      local_size = grainsize;
-//  XXX nvcc 6.5 RC miscompiles this loop when RandomAccessIterator is a counting_iterator
-//      bulk::copy_n(bulk::bound<grainsize>(g.this_exec),
-//                   local_first,
-//                   local_size,
-//                   local_inputs);
-      RandomAccessIterator iter = local_first.base();
-      for(size_type i = 0; i < grainsize; ++i, iter += groupsize)
-      {
-        local_inputs[i] = *iter;
-      } // end for
-    } // end else
-
-    // reduce local_inputs sequentially
-    this_sum = this_sum_defined ?
-      bulk::reduce(bulk::bound<grainsize>(g.this_exec), local_inputs, local_inputs + local_size, this_sum, binary_op) :
-      bulk::reduce(bulk::bound<grainsize-1>(g.this_exec), local_inputs + 1, local_inputs + local_size, T(local_inputs[0]), binary_op);
-
-    this_sum_defined = true;
-  } // end for
-
-#if __CUDA_ARCH__ >= 200
-  T *buffer = reinterpret_cast<T*>(bulk::malloc(g, groupsize * sizeof(T)));
-#else
-  __shared__ bulk::uninitialized_array<T,groupsize> buffer_impl;
-  T *buffer = buffer_impl.data();
-#endif
-
-  if(this_sum_defined)
-  {
-    buffer[tid] = this_sum;
-  } // end if
-
-  g.wait();
-
-  // reduce across the group
-  T result = bulk::detail::reduce_detail::destructive_reduce_n(g, buffer, thrust::min<size_type>(groupsize,n), init, binary_op);
-
-#if __CUDA_ARCH__ >= 200
-  bulk::free(g,buffer);
-#endif
-
-  return result;
-} // end reduce
-
-
-template<typename RandomAccessIterator, typename T, typename BinaryFunction>
-__device__
-T reduce(bulk::concurrent_group<> &g,
-         RandomAccessIterator first,
-         RandomAccessIterator last,
-         T init,
-         BinaryFunction binary_op)
-{
-  typedef int size_type;
-
-  size_type tid = g.this_exec.index();
-
-  T this_sum;
-
-  bool this_sum_defined = false;
-
-  typename thrust::iterator_difference<RandomAccessIterator>::type n = last - first;
-
-  T *buffer = reinterpret_cast<T*>(bulk::malloc(g, g.size() * sizeof(T)));
-
-  for(size_type i = tid; i < n; i += g.size())
-  {
-    typedef typename thrust::iterator_value<RandomAccessIterator>::type input_type;
-    input_type x = first[i];
-    this_sum = this_sum_defined ? binary_op(this_sum, x) : x;
-
-    this_sum_defined = true;
-  }
-
-  if(this_sum_defined)
-  {
-    buffer[tid] = this_sum;
-  } // end if
-
-  g.wait();
-
-  // reduce across the block
-  T result = detail::reduce_detail::destructive_reduce_n(g, buffer, thrust::min<size_type>(g.size(),n), init, binary_op);
-
-  bulk::free(g,buffer);
-
-  return result;
-} // end reduce
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp b/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp
deleted file mode 100644
index a1f3df4de..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/reduce_by_key.hpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/scan.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/scatter.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/head_flags.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/tail_flags.hpp>
-#include <thrust/detail/type_traits/function_traits.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/detail/minmax.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-namespace reduce_by_key_detail
-{
-
-
-template<typename FlagType, typename ValueType, typename BinaryFunction>
-struct scan_head_flags_functor
-{
-  BinaryFunction binary_op;
-
-  typedef thrust::tuple<FlagType,ValueType> result_type;
-  typedef result_type first_argument_type;
-  typedef result_type second_argument_type;
-
-  __host__ __device__
-  scan_head_flags_functor(BinaryFunction binary_op)
-    : binary_op(binary_op)
-  {}
-
-  __host__ __device__
-  result_type operator()(const first_argument_type &a, const second_argument_type &b)
-  {
-    ValueType val = thrust::get<0>(b) ? thrust::get<1>(b) : binary_op(thrust::get<1>(a), thrust::get<1>(b));
-    FlagType flag = thrust::get<0>(a) + thrust::get<0>(b);
-    return result_type(flag, val);
-  }
-};
-
-
-template<typename ConcurrentGroup,
-         typename InputIterator1,
-         typename Size,
-         typename InputIterator2,
-         typename InputIterator3,
-         typename OutputIterator1,
-         typename OutputIterator2>
-__device__
-void scatter_tails_n(ConcurrentGroup &group,
-                     InputIterator1 flags_first,
-                     Size n,
-                     InputIterator2 keys_first,
-                     InputIterator3 values_first,
-                     OutputIterator1 keys_result,
-                     OutputIterator2 values_result)
-{
-  // for each tail element in [flags_first, flags_first + n)
-  // scatter the key and value to that element's corresponding flag element - 1
-  
-  // the zip_iterators in this scatter_if can confuse nvcc's pointer space tracking for __CUDA_ARCH__ < 200
-  // separate the scatters for __CUDA_ARCH__ < 200
-#if __CUDA_ARCH__ >= 200
-  bulk::scatter_if(group,
-                   thrust::make_zip_iterator(thrust::make_tuple(values_first,         keys_first)),
-                   thrust::make_zip_iterator(thrust::make_tuple(values_first + n - 1, keys_first)),
-                   thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1),
-                   bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(),
-                   thrust::make_zip_iterator(thrust::make_tuple(values_result, keys_result)));
-#else
-  bulk::scatter_if(group,
-                   values_first, 
-                   values_first + n - 1,
-                   thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1),
-                   bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(),
-                   values_result);
-
-  bulk::scatter_if(group,
-                   keys_first, 
-                   keys_first + n - 1,
-                   thrust::make_transform_iterator(flags_first, thrust::placeholders::_1 - 1),
-                   bulk::detail::make_tail_flags(flags_first, flags_first + n).begin(),
-                   keys_result);
-#endif
-} // end scatter_tails_n()
-
-
-} // end reduce_by_key_detail
-} // end detail
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename T1,
-         typename T2,
-         typename BinaryPredicate,
-         typename BinaryFunction>
-thrust::tuple<
-  OutputIterator1,
-  OutputIterator2,
-  typename thrust::iterator_value<InputIterator1>::type,
-  typename thrust::iterator_value<OutputIterator2>::type
->
-__device__
-reduce_by_key(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-              InputIterator1 keys_first, InputIterator1 keys_last,
-              InputIterator2 values_first,
-              OutputIterator1 keys_result,
-              OutputIterator2 values_result,
-              T1 init_key,
-              T2 init_value,
-              BinaryPredicate pred,
-              BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_value<InputIterator2>::type value_type; // XXX this should be the type returned by BinaryFunction
-
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  const size_type interval_size = groupsize * grainsize;
-
-#if __CUDA_ARCH__ >= 200
-  size_type *s_flags = reinterpret_cast<size_type*>(bulk::malloc(g, interval_size * sizeof(int)));
-  value_type *s_values = reinterpret_cast<value_type*>(bulk::malloc(g, interval_size * sizeof(value_type)));
-#else
-  __shared__ uninitialized_array<size_type,interval_size> s_flags_impl;
-  size_type *s_flags = s_flags_impl.data();
-
-  __shared__ uninitialized_array<value_type,interval_size> s_values_impl;
-  value_type *s_values = s_values_impl.data();
-#endif
-
-  for(; keys_first < keys_last; keys_first += interval_size, values_first += interval_size)
-  {
-    // upper bound on n is interval_size
-    size_type n = thrust::min<size_type>(interval_size, keys_last - keys_first);
-
-    bulk::detail::head_flags_with_init<
-      InputIterator1,
-      BinaryPredicate,
-      size_type
-    > flags(keys_first, keys_first + n, init_key, pred);
-
-    detail::reduce_by_key_detail::scan_head_flags_functor<size_type, value_type, BinaryFunction> f(binary_op);
-
-    // load input into smem
-    bulk::copy_n(bulk::bound<interval_size>(g),
-                 thrust::make_zip_iterator(thrust::make_tuple(flags.begin(), values_first)),
-                 n,
-                 thrust::make_zip_iterator(thrust::make_tuple(s_flags, s_values)));
-
-    // scan in smem
-    bulk::inclusive_scan(bulk::bound<interval_size>(g),
-                         thrust::make_zip_iterator(thrust::make_tuple(s_flags,     s_values)),
-                         thrust::make_zip_iterator(thrust::make_tuple(s_flags + n, s_values)),
-                         thrust::make_zip_iterator(thrust::make_tuple(s_flags,     s_values)),
-                         thrust::make_tuple(1, init_value),
-                         f);
-
-    // scatter tail results to the output
-    detail::reduce_by_key_detail::scatter_tails_n(bulk::bound<interval_size>(g),
-                                                  s_flags, n,
-                                                  keys_first, s_values,
-                                                  keys_result, values_result);
-
-
-    // if the init was not a carry, we need to insert it at the beginning of the result
-    if(g.this_exec.index() == 0 && s_flags[0] > 1)
-    {
-      keys_result[0]   = init_key;
-      values_result[0] = init_value;
-    }
-
-    size_type result_size = s_flags[n - 1] - 1;
-
-    keys_result    += result_size;
-    values_result  += result_size;
-    init_key        = keys_first[n-1];
-    init_value      = s_values[n - 1];
-
-    g.wait();
-  } // end for
-
-#if __CUDA_ARCH__ >= 200
-  bulk::free(g, s_flags);
-  bulk::free(g, s_values);
-#endif
-
-  return thrust::make_tuple(keys_result, values_result, init_key, init_value);
-} // end reduce_by_key()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/scan.hpp b/thrust/system/cuda/detail/bulk/algorithm/scan.hpp
deleted file mode 100644
index 17db99fcd..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/scan.hpp
+++ /dev/null
@@ -1,598 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/copy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/accumulate.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/function_traits.h>
-#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t bound, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename T, typename BinaryFunction>
-__forceinline__ __device__
-RandomAccessIterator2
-  inclusive_scan(const bounded<bound, bulk::agent<grainsize> > &exec,
-                 RandomAccessIterator1 first,
-                 RandomAccessIterator1 last,
-                 RandomAccessIterator2 result,
-                 T init,
-                 BinaryFunction binary_op)
-{
-  for(int i = 0; i < exec.bound(); ++i)
-  {
-    if(first + i < last)
-    {
-      init = binary_op(init, first[i]);
-      result[i] = init;
-    } // end if
-  } // end for
-
-  return result + (last - first);
-} // end inclusive_scan
-
-
-template<std::size_t bound, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename T, typename BinaryFunction>
-__forceinline__ __device__
-RandomAccessIterator2
-  exclusive_scan(const bounded<bound, bulk::agent<grainsize> > &exec,
-                 RandomAccessIterator1 first,
-                 RandomAccessIterator1 last,
-                 RandomAccessIterator2 result,
-                 T init,
-                 BinaryFunction binary_op)
-{
-  for(int i = 0; i < exec.bound(); ++i)
-  {
-    if(first + i < last)
-    {
-      result[i] = init;
-      init = binary_op(init, first[i]);
-    } // end if
-  } // end for
-
-  return result + (last - first);
-} // end exclusive_scan
-
-
-namespace detail
-{
-namespace scan_detail
-{
-
-
-template<typename InputIterator, typename OutputIterator, typename BinaryFunction>
-struct scan_intermediate
-  : thrust::detail::eval_if<
-      thrust::detail::has_result_type<BinaryFunction>::value,
-      thrust::detail::result_type<BinaryFunction>,
-      thrust::detail::eval_if<
-        thrust::detail::is_output_iterator<OutputIterator>::value,
-        thrust::iterator_value<InputIterator>,
-        thrust::iterator_value<OutputIterator>
-      >
-    >
-{};
-
-
-template<typename ConcurrentGroup, typename RandomAccessIterator, typename T, typename BinaryFunction>
-__device__ T inplace_exclusive_scan(ConcurrentGroup &g, RandomAccessIterator first, T init, BinaryFunction binary_op)
-{
-  typedef typename ConcurrentGroup::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-
-  if(tid == 0)
-  {
-    first[0] = binary_op(init, first[0]);
-  }
-
-  T x = first[tid];
-
-  g.wait();
-
-  for(size_type offset = 1; offset < g.size(); offset += offset)
-  {
-    if(tid >= offset)
-    {
-      x = binary_op(first[tid - offset], x);
-    }
-
-    g.wait();
-
-    first[tid] = x;
-
-    g.wait();
-  }
-
-  T result = first[g.size() - 1];
-
-  if(tid == 0)
-  {
-    x = init;
-  }
-  else
-  {
-    x = first[tid - 1];
-  }
-
-  g.wait();
-
-  first[tid] = x;
-
-  g.wait();
-
-  return result;
-}
-
-
-template<typename ConcurrentGroup, typename RandomAccessIterator, typename Size, typename T, typename BinaryFunction>
-__device__ T small_inplace_exclusive_scan(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op)
-{
-  typedef typename ConcurrentGroup::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-
-  if(tid == 0)
-  {
-    first[0] = binary_op(init, first[0]);
-  }
-
-  T x = tid < n ? first[tid] : init;
-
-  g.wait();
-
-  for(size_type offset = 1; offset < g.size(); offset += offset)
-  {
-    if(tid >= offset && tid - offset < n)
-    {
-      x = binary_op(first[tid - offset], x);
-    }
-
-    g.wait();
-
-    if(tid < n)
-    {
-      first[tid] = x;
-    }
-
-    g.wait();
-  }
-
-  T result = first[n - 1];
-
-  if(tid < n)
-  {
-    if(tid == 0)
-    {
-      x = init;
-    }
-    else
-    {
-      x = first[tid - 1];
-    }
-  }
-
-  g.wait();
-
-  if(tid < n)
-  {
-    first[tid] = x;
-  }
-
-  g.wait();
-
-  return result;
-}
-
-
-// the upper bound on n is g.size()
-template<typename ConcurrentGroup, typename RandomAccessIterator, typename Size, typename T, typename BinaryFunction>
-__device__ T bounded_inplace_exclusive_scan(ConcurrentGroup &g, RandomAccessIterator first, Size n, T init, BinaryFunction binary_op)
-{
-  return (n == g.size()) ?
-    inplace_exclusive_scan(g, first, init, binary_op) :
-    small_inplace_exclusive_scan(g, first, n, init, binary_op);
-}
-
-
-template<bool inclusive,
-         std::size_t bound, std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryFunction>
-__device__
-// XXX MSVC9 has trouble with this enable_if, so just don't bother with it
-//typename thrust::detail::enable_if<
-//  bound <= groupsize * grainsize,
-//  T
-//>::type
-T
-scan(bulk::bounded<
-       bound,
-       bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-     > &g,
-     RandomAccessIterator1 first, RandomAccessIterator1 last,
-     RandomAccessIterator2 result,
-     T carry_in,
-     BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type input_type;
-
-  typedef typename scan_intermediate<
-    RandomAccessIterator1,
-    RandomAccessIterator2,
-    BinaryFunction
-  >::type intermediate_type;
-  
-  typedef typename bulk::bounded<
-    bound,
-    bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-  >::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-  size_type n = last - first;
-
-  // make a local copy from the input
-  input_type local_inputs[grainsize];
-  
-  size_type local_offset = grainsize * tid;
-  size_type local_size = thrust::max<size_type>(0,thrust::min<size_type>(grainsize, n - grainsize * tid));
-  
-  bulk::copy_n(bulk::bound<grainsize>(g.this_exec), first + local_offset, local_size, local_inputs);
-  
-  // XXX this should be uninitialized<intermediate_type>
-  intermediate_type x;
-  
-  if(local_size)
-  {
-    x = local_inputs[0];
-    x = bulk::accumulate(bulk::bound<grainsize-1>(g.this_exec), local_inputs + 1, local_inputs + local_size, x, binary_op);
-  } // end if
-  
-  g.wait();
-  
-  if(local_size)
-  {
-    result[tid] = x;
-  } // end if
-  
-  g.wait();
-
-  // count the number of spine elements
-  const size_type spine_n = (n >= g.size() * g.this_exec.grainsize()) ? g.size() : (n + g.this_exec.grainsize() - 1) / g.this_exec.grainsize();
-  
-  // exclusive scan the array of per-thread sums
-  // XXX this call is another bounded scan
-  //     the bound is groupsize
-  carry_in = bounded_inplace_exclusive_scan(g, result, spine_n, carry_in, binary_op);
-  
-  if(local_size)
-  {
-    x = result[tid];
-  } // end if
-  
-  g.wait();
-  
-  if(inclusive)
-  {
-    bulk::inclusive_scan(bulk::bound<grainsize>(g.this_exec), local_inputs, local_inputs + local_size, result + local_offset, x, binary_op);
-  } // end if
-  else
-  {
-    bulk::exclusive_scan(bulk::bound<grainsize>(g.this_exec), local_inputs, local_inputs + local_size, result + local_offset, x, binary_op);
-  } // end else
-  
-  g.wait();
-
-  return carry_in;
-} // end scan()
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename BinaryFunction>
-struct scan_buffer
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type  input_type;
-
-  typedef typename scan_intermediate<
-    RandomAccessIterator1,
-    RandomAccessIterator2,
-    BinaryFunction
-  >::type intermediate_type;
-
-  union
-  {
-    uninitialized_array<input_type, groupsize * grainsize>        inputs;
-    uninitialized_array<intermediate_type, groupsize * grainsize> results;
-  };
-};
-
-
-template<bool inclusive, std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename RandomAccessIterator2, typename T, typename BinaryFunction>
-__device__ void scan_with_buffer(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-                                 RandomAccessIterator1 first, RandomAccessIterator1 last,
-                                 RandomAccessIterator2 result,
-                                 T carry_in,
-                                 BinaryFunction binary_op,
-                                 scan_buffer<groupsize,grainsize,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> &buffer)
-{
-  typedef scan_buffer<
-    groupsize,
-    grainsize,
-    RandomAccessIterator1,
-    RandomAccessIterator2,
-    BinaryFunction
-  > buffer_type;
-
-  typedef typename buffer_type::input_type        input_type;
-  typedef typename buffer_type::intermediate_type intermediate_type;
-
-  // XXX grabbing this pointer up front before the loop is noticeably
-  //     faster than dereferencing inputs or results inside buffer
-  //     in the loop below
-  union {
-    input_type        *inputs;
-    intermediate_type *results;
-  } stage;
-
-  stage.inputs = buffer.inputs.data();
-
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  size_type tid = g.this_exec.index();
-
-  const size_type elements_per_group = groupsize * grainsize;
-
-  for(; first < last; first += elements_per_group, result += elements_per_group)
-  {
-    size_type partition_size = thrust::min<size_type>(elements_per_group, last - first);
-    
-    // stage data through shared memory
-    bulk::copy_n(g, first, partition_size, stage.inputs);
-
-    carry_in = scan<inclusive>(bulk::bound<elements_per_group>(g),
-                               stage.inputs, stage.inputs + partition_size,
-                               stage.results,
-                               carry_in,
-                               binary_op);
-    
-    // copy to result 
-    bulk::copy_n(g, stage.results, partition_size, result);
-  } // end for
-} // end scan_with_buffer()
-
-
-} // end scan_detail
-} // end detail
-
-
-template<std::size_t bound,
-         std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryFunction>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize,
-  RandomAccessIterator2
->::type
-inclusive_scan(bulk::bounded<
-                 bound,
-                 bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-               > &g,
-               RandomAccessIterator1 first, RandomAccessIterator1 last,
-               RandomAccessIterator2 result,
-               T carry_in,
-               BinaryFunction binary_op)
-{
-  detail::scan_detail::scan<true>(g, first, last, result, carry_in, binary_op);
-  return result + (last - first);
-} // end inclusive_scan()
-
-
-template<std::size_t bound,
-         std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename BinaryFunction>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize,
-  RandomAccessIterator2
->::type
-inclusive_scan(bulk::bounded<
-                 bound,
-                 bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-               > &g,
-               RandomAccessIterator1 first, RandomAccessIterator1 last,
-               RandomAccessIterator2 result,
-               BinaryFunction binary_op)
-{
-  if(bound > 0 && first < last)
-  {
-    typename thrust::iterator_value<RandomAccessIterator1>::type init = *first;
-
-    // we need to wait because first may be the same as result
-    g.wait();
-
-    if(g.this_exec.index() == 0)
-    {
-      *result = init;
-    }
-
-    detail::scan_detail::scan<true>(g, first + 1, last, result + 1, init, binary_op);
-  }
-
-  return result + (last - first);
-} // end inclusive_scan()
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryFunction>
-__device__ void inclusive_scan(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-                               RandomAccessIterator1 first, RandomAccessIterator1 last,
-                               RandomAccessIterator2 result,
-                               T init,
-                               BinaryFunction binary_op)
-{
-  typedef detail::scan_detail::scan_buffer<groupsize,grainsize,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> buffer_type;
-
-#if __CUDA_ARCH__ >= 200
-  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
-
-  if(bulk::is_on_chip(buffer))
-  {
-    detail::scan_detail::scan_with_buffer<true>(g, first, last, result, init, binary_op, *bulk::on_chip_cast(buffer));
-  } // end if
-  else
-  {
-    detail::scan_detail::scan_with_buffer<true>(g, first, last, result, init, binary_op, *buffer);
-  } // end else
-
-  bulk::free(g, buffer);
-#else
-  __shared__ uninitialized<buffer_type> buffer;
-  detail::scan_detail::scan_with_buffer<true>(g, first, last, result, init, binary_op, buffer.get());
-#endif // __CUDA_ARCH__
-} // end inclusive_scan()
-
-
-template<std::size_t size,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename BinaryFunction>
-__device__
-RandomAccessIterator2
-inclusive_scan(bulk::concurrent_group<bulk::agent<grainsize>,size> &this_group,
-               RandomAccessIterator1 first,
-               RandomAccessIterator1 last,
-               RandomAccessIterator2 result,
-               BinaryFunction binary_op)
-{
-  if(first < last)
-  {
-    // the first input becomes the init
-    // XXX convert to the immediate type when passing init to respect Thrust's semantics
-    //     when Thrust adopts the semantics of N3724, just forward along *first
-    //typename thrust::iterator_value<RandomAccessIterator1>::type init = *first;
-    typename detail::scan_detail::scan_intermediate<
-      RandomAccessIterator1,
-      RandomAccessIterator2,
-      BinaryFunction
-    >::type init = *first;
-
-    // we need to wait because first may be the same as result
-    this_group.wait();
-
-    if(this_group.this_exec.index() == 0)
-    {
-      *result = init;
-    } // end if
-
-    bulk::inclusive_scan(this_group, first + 1, last, result + 1, init, binary_op);
-  } // end if
-
-  return result + (last - first);
-} // end inclusive_scan()
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryFunction>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize,
-  RandomAccessIterator2
->::type
-exclusive_scan(bulk::bounded<
-                 bound,
-                 bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-               > &g,
-               RandomAccessIterator1 first, RandomAccessIterator1 last,
-               RandomAccessIterator2 result,
-               T carry_in,
-               BinaryFunction binary_op)
-{
-  detail::scan_detail::scan<true>(g, first, last, result, carry_in, binary_op);
-  return result + (last - first);
-} // end exclusive_scan()
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename T,
-         typename BinaryFunction>
-__device__
-typename thrust::detail::enable_if<
-  (groupsize > 0),
-  RandomAccessIterator2
->::type
-exclusive_scan(bulk::concurrent_group<agent<grainsize>,groupsize> &g,
-               RandomAccessIterator1 first, RandomAccessIterator1 last,
-               RandomAccessIterator2 result,
-               T init,
-               BinaryFunction binary_op)
-{
-  typedef detail::scan_detail::scan_buffer<groupsize,grainsize,RandomAccessIterator1,RandomAccessIterator2,BinaryFunction> buffer_type;
-
-#if __CUDA_ARCH__ >= 200
-  buffer_type *buffer = reinterpret_cast<buffer_type*>(bulk::malloc(g, sizeof(buffer_type)));
-
-  if(bulk::is_on_chip(buffer))
-  {
-    detail::scan_detail::scan_with_buffer<false>(g, first, last, result, init, binary_op, *bulk::on_chip_cast(buffer));
-  } // end if
-  else
-  {
-    detail::scan_detail::scan_with_buffer<false>(g, first, last, result, init, binary_op, *buffer);
-  } // end else
-
-  bulk::free(g, buffer);
-#else
-  __shared__ uninitialized<buffer_type> buffer;
-  detail::scan_detail::scan_with_buffer<false>(g, first, last, result, init, binary_op, buffer.get());
-#endif
-
-  return result + (last - first);
-} // end exclusive_scan()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp b/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp
deleted file mode 100644
index 3c8c77e15..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/scatter.hpp
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4>
-__forceinline__ __device__
-void scatter_if(const bounded<bound,agent<grainsize> > &exec,
-                RandomAccessIterator1 first,
-                RandomAccessIterator1 last,
-                RandomAccessIterator2 map,
-                RandomAccessIterator3 stencil,
-                RandomAccessIterator4 result)
-{
-  typedef int size_type;
-
-  size_type n = last - first;
-
-  for(size_type i = 0; i < bound; ++i)
-  {
-    if(i < n && stencil[i])
-    {
-      result[map[i]] = first[i];
-    } // end if
-  } // end for
-} // end scatter_if()
-
-
-template<std::size_t bound,
-         std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1, 
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize
->::type
-scatter_if(bulk::bounded<
-             bound,
-             bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-           > &g,
-           RandomAccessIterator1 first,
-           RandomAccessIterator1 last,
-           RandomAccessIterator2 map,
-           RandomAccessIterator3 stencil,
-           RandomAccessIterator4 result)
-{
-  typedef typename bulk::bounded<
-    bound,
-    bulk::concurrent_group<bulk::agent<grainsize>,groupsize>
-  >::size_type size_type;
-
-  size_type n = last - first;
-
-  size_type tid = g.this_exec.index();
-
-  // avoid branches when possible
-  if(n == bound)
-  {
-    for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
-    {
-      size_type idx = g.size() * i + tid;
-
-      if(stencil[idx])
-      {
-        result[map[idx]] = first[idx];
-      } // end if
-    } // end for
-  } // end if
-  else if(n < bound)
-  {
-    for(size_type i = 0; i < g.this_exec.grainsize(); ++i)
-    {
-      size_type idx = g.size() * i + tid;
-
-      if(idx < (last - first) && stencil[idx])
-      {
-        result[map[idx]] = first[idx];
-      } // end if
-    } // end for
-  } // end if
-
-  g.wait();
-} // end scatter_if()
-
-
-template<std::size_t groupsize,
-         std::size_t grainsize,
-         typename RandomAccessIterator1, 
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename RandomAccessIterator4>
-__device__
-void scatter_if(bulk::concurrent_group<bulk::agent<grainsize>,groupsize> &g,
-                RandomAccessIterator1 first,
-                RandomAccessIterator1 last,
-                RandomAccessIterator2 map,
-                RandomAccessIterator3 stencil,
-                RandomAccessIterator4 result)
-{
-  typedef typename bulk::concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-
-  size_type chunk_size = g.size() * grainsize;
-
-  size_type n = last - first;
-
-  size_type tid = g.this_exec.index();
-
-  // important special case which avoids the expensive for loop below
-  if(chunk_size == n)
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type idx = g.size() * i + tid;
-
-      if(stencil[idx])
-      {
-        result[map[idx]] = first[idx];
-      } // end if
-    } // end for
-  } // end if
-  else if(n < chunk_size)
-  {
-    for(size_type i = 0; i < grainsize; ++i)
-    {
-      size_type idx = g.size() * i + tid;
-
-      if(idx < (last - first) && stencil[idx])
-      {
-        result[map[idx]] = first[idx];
-      } // end if
-    } // end for
-  } // end if
-  else
-  {
-    for(;
-        first < last;
-        first += chunk_size, map += chunk_size, stencil += chunk_size)
-    {
-      if((last - first) >= chunk_size)
-      {
-        // avoid conditional accesses when possible
-        for(size_type i = 0; i < grainsize; ++i)
-        {
-          size_type idx = g.size() * i + tid;
-
-          if(stencil[idx])
-          {
-            result[map[idx]] = first[idx];
-          } // end if
-        } // end for
-      } // end if
-      else
-      {
-        for(size_type i = 0; i < grainsize; ++i)
-        {
-          size_type idx = g.size() * i + tid;
-
-          if(idx < (last - first) && stencil[idx])
-          {
-            result[map[idx]] = first[idx];
-          } // end if
-        } // end for
-      } // end else
-    } // end for
-  } // end else
-
-  g.wait();
-} // end scatter_if
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/algorithm/sort.hpp b/thrust/system/cuda/detail/bulk/algorithm/sort.hpp
deleted file mode 100644
index 1874ac7d6..000000000
--- a/thrust/system/cuda/detail/bulk/algorithm/sort.hpp
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm/detail/stable_merge_sort.hpp>
-#include <thrust/detail/swap.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-namespace sort_detail
-{
-
-
-template<int i, int bound>
-struct stable_odd_even_transpose_sort_by_key_impl
-{
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
-  static __device__
-  void sort(RandomAccessIterator1 keys, RandomAccessIterator2 values, int n, Compare comp)
-  {
-    for(int j = 1 & i; j < bound - 1; j += 2)
-    {
-      if(j + 1 < n && comp(keys[j + 1], keys[j]))
-      {
-        using thrust::swap;
-
-      	swap(keys[j], keys[j + 1]);
-      	swap(values[j], values[j + 1]);
-      }
-    }
-
-    stable_odd_even_transpose_sort_by_key_impl<i + 1, bound>::sort(keys, values, n, comp);
-  }
-};
-
-
-template<int i> struct stable_odd_even_transpose_sort_by_key_impl<i, i>
-{
-  template<typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
-  static __device__ void sort(RandomAccessIterator1, RandomAccessIterator2, int, Compare) { }
-};
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__forceinline__ __device__
-void stable_odd_even_transpose_sort_by_key(const bounded<bound,agent<grainsize> > &,
-                                           RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
-                                           RandomAccessIterator2 values_first,
-                                           Compare comp)
-{
-  stable_odd_even_transpose_sort_by_key_impl<0, bound>::sort(keys_first, values_first, keys_last - keys_first, comp);
-} // end stable_odd_even_transpose_sort_by_key()
-
-
-template<int i, int bound>
-struct stable_odd_even_transpose_sort_impl
-{
-  template<typename RandomAccessIterator, typename Compare>
-  static __device__
-  void sort(RandomAccessIterator keys, int n, Compare comp)
-  {
-    for(int j = 1 & i; j < bound - 1; j += 2)
-    {
-      if(j + 1 < n && comp(keys[j + 1], keys[j]))
-      {
-        using thrust::swap;
-
-      	swap(keys[j], keys[j + 1]);
-      }
-    }
-
-    stable_odd_even_transpose_sort_impl<i + 1, bound>::sort(keys, n, comp);
-  }
-};
-
-
-template<int i> struct stable_odd_even_transpose_sort_impl<i, i>
-{
-  template<typename RandomAccessIterator, typename Compare>
-  static __device__ void sort(RandomAccessIterator, int, Compare) { }
-};
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator,
-         typename Compare>
-__forceinline__ __device__
-void stable_odd_even_transpose_sort(const bounded<bound,agent<grainsize> > &,
-                                    RandomAccessIterator first, RandomAccessIterator last,
-                                    Compare comp)
-{
-  stable_odd_even_transpose_sort_impl<0, bound>::sort(first, last - first, comp);
-} // end stable_odd_even_transpose_sort()
-
-
-} // end sort_detail
-} // end detail
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__forceinline__ __device__
-void stable_sort_by_key(const bounded<bound,agent<grainsize> > &exec,
-                        RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
-                        RandomAccessIterator2 values_first,
-                        Compare comp)
-{
-  bulk::detail::sort_detail::stable_odd_even_transpose_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-} // end stable_sort_by_key()
-
-
-template<std::size_t bound,
-         std::size_t grainsize,
-         typename RandomAccessIterator,
-         typename Compare>
-__forceinline__ __device__
-void stable_sort(const bounded<bound,agent<grainsize> > &exec,
-                 RandomAccessIterator first, RandomAccessIterator last,
-                 Compare comp)
-{
-  bulk::detail::sort_detail::stable_odd_even_transpose_sort(exec, first, last, comp);
-} // end stable_sort()
-
-
-template<std::size_t bound, std::size_t groupsize, std::size_t grainsize,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__device__
-typename thrust::detail::enable_if<
-  bound <= groupsize * grainsize
->::type
-stable_sort_by_key(bulk::bounded<bound,bulk::concurrent_group<bulk::agent<grainsize>,groupsize> > &g,
-                   RandomAccessIterator1 keys_first, RandomAccessIterator1 keys_last,
-                   RandomAccessIterator2 values_first,
-                   Compare comp)
-{
-  bulk::detail::stable_merge_sort_by_key(g, keys_first, keys_last, values_first, comp);
-} // end stable_sort_by_key()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/async.hpp b/thrust/system/cuda/detail/bulk/async.hpp
deleted file mode 100644
index f3ee5e594..000000000
--- a/thrust/system/cuda/detail/bulk/async.hpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/future.hpp>
-#include <thrust/detail/config.h>
-#include <thrust/detail/cstdint.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename ExecutionGroup, typename Function>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9);
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9, Arg10 arg10);
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
-#include <thrust/system/cuda/detail/bulk/detail/async.inl>
-
diff --git a/thrust/system/cuda/detail/bulk/bulk.hpp b/thrust/system/cuda/detail/bulk/bulk.hpp
deleted file mode 100644
index b65b8c468..000000000
--- a/thrust/system/cuda/detail/bulk/bulk.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/choose_sizes.hpp>
-#include <thrust/system/cuda/detail/bulk/future.hpp>
-#include <thrust/system/cuda/detail/bulk/async.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/algorithm.hpp>
-#include <thrust/system/cuda/detail/bulk/iterator.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-
diff --git a/thrust/system/cuda/detail/bulk/choose_sizes.hpp b/thrust/system/cuda/detail/bulk/choose_sizes.hpp
deleted file mode 100644
index 43bac6b23..000000000
--- a/thrust/system/cuda/detail/bulk/choose_sizes.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/pair.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename Function>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f);
-
-
-template<typename Function, typename Arg1>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1);
-
-
-template<typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2);
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3);
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4);
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5);
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6);
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
-#include <thrust/system/cuda/detail/bulk/detail/choose_sizes.inl>
-
diff --git a/thrust/system/cuda/detail/bulk/detail/alignment.hpp b/thrust/system/cuda/detail/bulk/detail/alignment.hpp
deleted file mode 100644
index bf8d230ab..000000000
--- a/thrust/system/cuda/detail/bulk/detail/alignment.hpp
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-namespace alignment_of_detail
-{
-
-
-template<typename T> class alignment_of_impl;
-
-template<typename T, std::size_t size_diff>
-  struct helper
-{
-  static const std::size_t value = size_diff;
-};
-
-template<typename T>
-  class helper<T,0>
-{
-  public:
-    static const std::size_t value = alignment_of_impl<T>::value;
-};
-
-template<typename T>
-  class alignment_of_impl
-{
-  private:
-    struct big { T x; char c; };
-
-  public:
-    static const std::size_t value = helper<big, sizeof(big) - sizeof(T)>::value;
-};
-
-
-} // end alignment_of_detail
-
-
-template<typename T>
-  struct alignment_of
-    : alignment_of_detail::alignment_of_impl<T>
-{};
-
-
-template<std::size_t Align> struct aligned_type;
-
-// __align__ is CUDA-specific, so guard it
-#if defined(__CUDACC__)
-
-// implementing aligned_type portably is tricky:
-
-#  if defined(_MSC_VER)
-// implement aligned_type with specialization because MSVC
-// requires literals as arguments to declspec(align(n))
-template<> struct aligned_type<1>
-{
-  struct __align__(1) type { };
-};
-
-template<> struct aligned_type<2>
-{
-  struct __align__(2) type { };
-};
-
-template<> struct aligned_type<4>
-{
-  struct __align__(4) type { };
-};
-
-template<> struct aligned_type<8>
-{
-  struct __align__(8) type { };
-};
-
-template<> struct aligned_type<16>
-{
-  struct __align__(16) type { };
-};
-
-template<> struct aligned_type<32>
-{
-  struct __align__(32) type { };
-};
-
-template<> struct aligned_type<64>
-{
-  struct __align__(64) type { };
-};
-
-template<> struct aligned_type<128>
-{
-  struct __align__(128) type { };
-};
-
-template<> struct aligned_type<256>
-{
-  struct __align__(256) type { };
-};
-
-template<> struct aligned_type<512>
-{
-  struct __align__(512) type { };
-};
-
-template<> struct aligned_type<1024>
-{
-  struct __align__(1024) type { };
-};
-
-template<> struct aligned_type<2048>
-{
-  struct __align__(2048) type { };
-};
-
-template<> struct aligned_type<4096>
-{
-  struct __align__(4096) type { };
-};
-
-template<> struct aligned_type<8192>
-{
-  struct __align__(8192) type { };
-};
-#  elif defined(__GNUC__) && ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) < 40600)
-// implement aligned_type with specialization because older gcc
-// requires literals as arguments to __attribute__(aligned(n))
-template<> struct aligned_type<1>
-{
-  struct __align__(1) type { };
-};
-
-template<> struct aligned_type<2>
-{
-  struct __align__(2) type { };
-};
-
-template<> struct aligned_type<4>
-{
-  struct __align__(4) type { };
-};
-
-template<> struct aligned_type<8>
-{
-  struct __align__(8) type { };
-};
-
-template<> struct aligned_type<16>
-{
-  struct __align__(16) type { };
-};
-
-template<> struct aligned_type<32>
-{
-  struct __align__(32) type { };
-};
-
-template<> struct aligned_type<64>
-{
-  struct __align__(64) type { };
-};
-
-template<> struct aligned_type<128>
-{
-  struct __align__(128) type { };
-};
-
-#  else
-// assume the compiler allows template parameters as
-// arguments to __align__ 
-template<std::size_t Align> struct aligned_type
-{
-  struct __align__(Align) type { };
-};
-#  endif // THRUST_HOST_COMPILER
-#else
-template<std::size_t Align> struct aligned_type
-{
-  struct type { };
-};
-#endif // THRUST_DEVICE_COMPILER
-
-
-template<std::size_t Len, std::size_t Align>
-  struct aligned_storage
-{
-  union type
-  {
-    unsigned char data[Len];
-
-    typename aligned_type<Align>::type align;
-  };
-};
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp b/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp
deleted file mode 100644
index 62979731a..000000000
--- a/thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/tuple.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename Function>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<> &)
-{
-  f();
-}
-
-
-template<typename Function, typename Arg1>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1> &args)
-{
-  f(thrust::get<0>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args),
-    thrust::get<5>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args),
-    thrust::get<5>(args),
-    thrust::get<6>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args),
-    thrust::get<5>(args),
-    thrust::get<6>(args),
-    thrust::get<7>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args),
-    thrust::get<5>(args),
-    thrust::get<6>(args),
-    thrust::get<7>(args),
-    thrust::get<8>(args));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-__host__ __device__
-void apply_from_tuple(Function f, const thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9,Arg10> &args)
-{
-  f(thrust::get<0>(args),
-    thrust::get<1>(args),
-    thrust::get<2>(args),
-    thrust::get<3>(args),
-    thrust::get<4>(args),
-    thrust::get<5>(args),
-    thrust::get<6>(args),
-    thrust::get<7>(args),
-    thrust::get<8>(args),
-    thrust::get<9>(args));
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/async.inl b/thrust/system/cuda/detail/bulk/detail/async.inl
deleted file mode 100644
index 09c4f3f15..000000000
--- a/thrust/system/cuda/detail/bulk/detail/async.inl
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/async.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/closure.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/terminate.hpp>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename ExecutionGroup, typename Closure>
-__host__ __device__
-future<void> async_in_stream(ExecutionGroup g, Closure c, cudaStream_t s, cudaEvent_t before_event)
-{
-#if __BULK_HAS_CUDART__
-  if(before_event != 0)
-  {
-    bulk::detail::throw_on_error(cudaStreamWaitEvent(s, before_event, 0), "cudaStreamWaitEvent in async_in_stream");
-  }
-#else
-  bulk::detail::terminate_with_message("async_in_stream(): cudaStreamWaitEvent requires CUDART");
-#endif
-
-  bulk::detail::cuda_launcher<ExecutionGroup, Closure> launcher;
-  launcher.launch(g, c, s);
-
-  return future_core_access::create(s, false);
-} // end async_in_stream()
-
-
-template<typename ExecutionGroup, typename Closure>
-__host__ __device__
-future<void> async(ExecutionGroup g, Closure c, cudaEvent_t before_event)
-{
-  cudaStream_t s;
-
-  // XXX cudaStreamCreate is __host__-only
-  //     figure out a way to support this that does not require creating a new stream
-#if (__BULK_HAS_CUDART__ && !defined(__CUDA_ARCH__))
-  bulk::detail::throw_on_error(cudaStreamCreate(&s), "cudaStreamCreate in bulk::detail::async");
-#else
-  s = 0;
-  bulk::detail::terminate_with_message("bulk::async(): cudaStreamCreate() is unsupported in __device__ code.");
-#endif
-
-#if __BULK_HAS_CUDART__
-  if(before_event != 0)
-  {
-    bulk::detail::throw_on_error(cudaStreamWaitEvent(s, before_event, 0), "cudaStreamWaitEvent in bulk::detail::async");
-  }
-#else
-  bulk::detail::terminate_with_message("async_in_stream(): cudaStreamWaitEvent requires CUDART");
-#endif
-
-  bulk::detail::cuda_launcher<ExecutionGroup, Closure> launcher;
-  launcher.launch(g, c, s);
-
-  // note we pass true here, unlike false above
-  return future_core_access::create(s, true);
-} // end async()
-
-
-template<typename ExecutionGroup, typename Closure>
-__host__ __device__
-future<void> async(ExecutionGroup g, Closure c)
-{
-  return bulk::detail::async_in_stream(g, c, 0, 0);
-} // end async()
-
-
-template<typename ExecutionGroup, typename Closure>
-__host__ __device__
-future<void> async(async_launch<ExecutionGroup> launch, Closure c)
-{
-  return launch.is_stream_valid() ?
-    bulk::detail::async_in_stream(launch.exec(), c, launch.stream(), launch.before_event()) :
-    bulk::detail::async(launch.exec(), c, launch.before_event());
-} // end async()
-
-
-} // end detail
-
-
-template<typename ExecutionGroup, typename Function>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f)
-{
-  return bulk::detail::async(g, detail::make_closure(f));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9));
-} // end async()
-
-
-template<typename ExecutionGroup, typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-__host__ __device__
-future<void> async(ExecutionGroup g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9, Arg10 arg10)
-{
-  return bulk::detail::async(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10));
-} // end async()
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl b/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl
deleted file mode 100644
index ca9d678b8..000000000
--- a/thrust/system/cuda/detail/bulk/detail/choose_sizes.inl
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/choose_sizes.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/closure.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename Closure>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Closure)
-{
-  bulk::detail::cuda_launcher<
-    parallel_group<concurrent_group<> >,
-    Closure
-  > launcher;
-
-  return launcher.choose_sizes(g.size(), g.this_exec.size());
-} // end choose_sizes()
-
-
-} // end detail
-
-
-template<typename Function>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f));
-}
-
-
-template<typename Function, typename Arg1>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-thrust::pair<typename parallel_group<concurrent_group<> >::size_type,
-             typename concurrent_group<>::size_type>
-  choose_sizes(parallel_group<concurrent_group<> > g, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6)
-{
-  return bulk::detail::choose_sizes(g, detail::make_closure(f,arg1,arg2,arg3,arg4,arg5,arg6));
-}
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/closure.hpp b/thrust/system/cuda/detail/bulk/detail/closure.hpp
deleted file mode 100644
index 63864a9d3..000000000
--- a/thrust/system/cuda/detail/bulk/detail/closure.hpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/apply_from_tuple.hpp>
-
-#include <thrust/detail/config.h>
-#include <thrust/tuple.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename Function, typename Tuple>
-class closure
-{
-  public:
-    typedef Function function_type;
-
-    typedef Tuple arguments_type;
-
-    __host__ __device__
-    closure(function_type f, const arguments_type &args)
-      :f(f),
-       args(args)
-    {}
-
-
-    __host__ __device__
-    void operator()()
-    {
-      apply_from_tuple(f,args);
-    }
-
-
-    __host__ __device__
-    function_type function() const
-    {
-      return f;
-    }
-
-
-    __host__ __device__
-    arguments_type arguments() const
-    {
-      return args;
-    }
-
-
-  private:
-    function_type   f;
-    arguments_type args;
-}; // end closure
-
-
-template<typename Function, typename Arguments>
-__host__ __device__
-const closure<Function,Arguments> &make_closure(const closure<Function,Arguments> &c)
-{
-  return c;
-}
-
-
-template<typename Function>
-__host__ __device__
-closure<Function, thrust::tuple<> > make_closure(Function f)
-{
-  return closure<Function,thrust::tuple<> >(f, thrust::tuple<>());
-}
-
-
-template<typename Function, typename Arg1>
-__host__ __device__
-closure<Function, thrust::tuple<Arg1> > make_closure(Function f, const Arg1 &a1)
-{
-  return closure<Function,thrust::tuple<Arg1> >(f, thrust::make_tuple(a1));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2> >(f, thrust::make_tuple(a1,a2));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3> >(f, thrust::make_tuple(a1,a2,a3));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4> >(f, thrust::make_tuple(a1,a2,a3,a4));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5> >(f, thrust::make_tuple(a1,a2,a3,a4,a5));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6> >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7> >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8> >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9> >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8,a9));
-}
-
-
-template<typename Function, typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-__host__ __device__
-closure<
-  Function,
-  thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9,Arg10>
->
-  make_closure(Function f, const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10)
-{
-  return closure<Function,thrust::tuple<Arg1,Arg2,Arg3,Arg4,Arg5,Arg6,Arg7,Arg8,Arg9,Arg10> >(f, thrust::make_tuple(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10));
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/config.hpp b/thrust/system/cuda/detail/bulk/detail/config.hpp
deleted file mode 100644
index 0a9a1c24c..000000000
--- a/thrust/system/cuda/detail/bulk/detail/config.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#ifndef BULK_NAMESPACE_PREFIX
-#define BULK_NAMESPACE_PREFIX
-#endif
-
-#ifndef BULK_NAMESPACE_SUFFIX
-#define BULK_NAMESPACE_SUFFIX
-#endif
-
-#if defined(__CUDACC__)
-#  ifndef __bulk_hd_warning_disable__
-#    if __CUDAVER__ >= 75000
-#      define __bulk_hd_warning_disable__ #pragma nv_exec_check_disable
-#    else
-#      define __bulk_hd_warning_disable__ #pragma hd_warning_disable
-#    endif /* __CUDAVER__ */
-#  endif // __bulk_hd_warning_disable__
-#else
-#  define __bulk_hd_warning_disable__
-#endif // __bulk_hd_warning_disable__
-
-#include <thrust/version.h>
-
-#if THRUST_VERSION < 100800
-#error "Bulk requires Thrust v1.8 (http://thrust.github.io) or better."
-#endif
-
-
-#if defined(__CUDACC__)
-#  if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__))
-#    define __BULK_HAS_CUDART__ 1
-#  else
-#    define __BULK_HAS_CUDART__ 0
-#  endif
-#else
-#  define __BULK_HAS_CUDART__ 0
-#endif
-
-#if defined(__CUDACC__)
-#  if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200)
-#    define __BULK_HAS_PRINTF__ 1
-#  else
-#    define __BULK_HAS_PRINTF__ 0
-#  endif
-#else
-#  define __BULK_HAS_PRINTF__ 1
-#endif
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp
deleted file mode 100644
index 5b577ee92..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-// XXX all of this functionality needs to be thrown out and replaced
-//     with the built-in occupancy stuff
-
-#include <cstddef>
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/detail/minmax.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-// XXX define our own device_properties_t to avoid errors when #including
-//     this file in the absence of a CUDA installation
-struct device_properties_t
-{
-  // mirror the type and spelling of cudaDeviceProp's members
-  // keep these alphabetized
-  int    major;
-  int    maxGridSize[3];
-  int    maxThreadsPerBlock;
-  int    maxThreadsPerMultiProcessor;
-  int    minor;
-  int    multiProcessorCount;
-  int    regsPerBlock;
-  size_t sharedMemPerBlock;
-  int    warpSize;
-};
-
-
-// XXX define our own device_properties_t to avoid errors when #including
-//     this file in the absence of a CUDA installation
-struct function_attributes_t
-{
-  // mirror the type and spelling of cudaFuncAttributes' members
-  // keep these alphabetized
-  size_t constSizeBytes;
-  size_t localSizeBytes;
-  int    maxThreadsPerBlock;
-  int    numRegs;
-  int    ptxVersion;
-  size_t sharedSizeBytes;
-};
-
-
-/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic.
- *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
- *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
- *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
- *          accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by
- *          the "CUDA Occupancy Calculator". 
- *  \note The __global__ function of interest is presumed to use 0 bytes of dynamically-allocated __shared__ memory.
- */
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties);
-
-/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic.
- *  Use this version of the function when a CUDA block's dynamically-allocated __shared__ memory requirements
- *  vary with the size of the block.
- *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
- *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
- *  \param block_size_to_dynamic_smem_bytes A unary function which maps an integer CUDA block size to the number of bytes
- *         of dynamically-allocated __shared__ memory required by a CUDA block of that size.
- *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
- *          accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by
- *          the "CUDA Occupancy Calculator". 
- */
-template<typename UnaryFunction>
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties,
-                                                        UnaryFunction block_size_to_dynamic_smem_size);
-
-
-/*! Returns the maximum amount of dynamic shared memory each block
- *  can utilize without reducing thread occupancy.
- *
- *  \param properties CUDA device properties
- *  \param attributes CUDA function attributes
- *  \param blocks_per_processor Number of blocks per streaming multiprocessor
- */
-inline __host__ __device__
-size_t proportional_smem_allocation(const device_properties_t   &properties,
-                                    const function_attributes_t &attributes,
-                                    size_t blocks_per_processor);
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-size_t max_blocksize_subject_to_smem_usage(const device_properties_t   &properties,
-                                           const function_attributes_t &attributes,
-                                           UnaryFunction blocksize_to_dynamic_smem_usage);
-
-
-
-namespace cuda_launch_config_detail
-{
-
-using std::size_t;
-
-namespace util
-{
-
-
-template<typename T>
-inline __host__ __device__
-T min_(const T &lhs, const T &rhs)
-{
-  return rhs < lhs ? rhs : lhs;
-}
-
-
-template <typename T>
-struct zero_function
-{
-  inline __host__ __device__
-  T operator()(T)
-  {
-    return 0;
-  }
-};
-
-
-// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_ri(const L x, const R y)
-{
-    return (x + (y - 1)) / y;
-}
-
-// x/y rounding towards zero for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_rz(const L x, const R y)
-{
-    return x / y;
-}
-
-// round x towards infinity to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }
-
-// round x towards zero to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); }
-
-} // end namespace util
-
-
-
-// granularity of shared memory allocation
-inline __host__ __device__
-size_t smem_allocation_unit(const device_properties_t &properties)
-{
-  switch(properties.major)
-  {
-    case 1:  return 512;
-    case 2:  return 128;
-    case 3:  return 256;
-    default: return 256; // unknown GPU; have to guess
-  }
-}
-
-
-// granularity of register allocation
-inline __host__ __device__
-int reg_allocation_unit(const device_properties_t &properties, const size_t regsPerThread)
-{
-  switch(properties.major)
-  {
-    case 1:  return (properties.minor <= 1) ? 256 : 512;
-    case 2:  switch(regsPerThread)
-             {
-               case 21:
-               case 22:
-               case 29:
-               case 30:
-               case 37:
-               case 38:
-               case 45:
-               case 46:
-                 return 128;
-               default:
-                 return 64;
-             }
-    case 3:  return 256;
-    default: return 256; // unknown GPU; have to guess
-  }
-}
-
-
-// granularity of warp allocation
-inline __host__ __device__
-size_t warp_allocation_multiple(const device_properties_t &properties)
-{
-  return (properties.major <= 1) ? 2 : 1;
-}
-
-// number of "sides" into which the multiprocessor is partitioned
-inline __host__ __device__
-size_t num_sides_per_multiprocessor(const device_properties_t &properties)
-{
-  switch(properties.major)
-  {
-    case 1:  return 1;
-    case 2:  return 2;
-    case 3:  return 4;
-    default: return 4; // unknown GPU; have to guess
-  }
-}
-
-
-inline __host__ __device__
-size_t max_blocks_per_multiprocessor(const device_properties_t &properties)
-{
-  return (properties.major <= 2) ? 8 : 16;
-}
-
-
-inline __host__ __device__
-size_t max_active_blocks_per_multiprocessor(const device_properties_t    &properties,
-                                            const function_attributes_t  &attributes,
-                                            size_t CTA_SIZE,
-                                            size_t dynamic_smem_bytes)
-{
-  // Determine the maximum number of CTAs that can be run simultaneously per SM
-  // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
-
-  //////////////////////////////////////////
-  // Limits due to threads/SM or blocks/SM
-  //////////////////////////////////////////
-  const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor;  // 768, 1024, 1536, etc.
-  const size_t maxBlocksPerSM  = max_blocks_per_multiprocessor(properties);
-
-  // Calc limits
-  const size_t ctaLimitThreads = (CTA_SIZE <= size_t(properties.maxThreadsPerBlock)) ? maxThreadsPerSM / CTA_SIZE : 0;
-  const size_t ctaLimitBlocks  = maxBlocksPerSM;
-
-  //////////////////////////////////////////
-  // Limits due to shared memory/SM
-  //////////////////////////////////////////
-  const size_t smemAllocationUnit     = smem_allocation_unit(properties);
-  const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;
-  const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit);
-
-  // Calc limit
-  const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
-
-  //////////////////////////////////////////
-  // Limits due to registers/SM
-  //////////////////////////////////////////
-  const int regAllocationUnit = reg_allocation_unit(properties, attributes.numRegs);
-  const size_t warpAllocationMultiple = warp_allocation_multiple(properties);
-  const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);
-
-  // Calc limit
-  size_t ctaLimitRegs;
-  if(properties.major <= 1)
-  {
-    // GPUs of compute capability 1.x allocate registers to CTAs
-    // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
-    const size_t regsPerCTA = util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
-    ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
-  }
-  else
-  {
-    // GPUs of compute capability 2.x and higher allocate registers to warps
-    // Number of regs per warp is regs per thread times times warp size, rounded up to allocation unit
-    const size_t regsPerWarp = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
-    const size_t numSides = num_sides_per_multiprocessor(properties);
-    const size_t numRegsPerSide = properties.regsPerBlock / numSides;
-    ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
-  }
-
-  //////////////////////////////////////////
-  // Overall limit is min() of limits due to above reasons
-  //////////////////////////////////////////
-  return util::min_(ctaLimitRegs, util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks)));
-}
-
-
-} // end namespace cuda_launch_config_detail
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties,
-                                                        UnaryFunction block_size_to_dynamic_smem_size)
-{
-  size_t max_occupancy      = properties.maxThreadsPerMultiProcessor;
-  size_t largest_blocksize  = cuda_launch_config_detail::util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
-  size_t granularity        = properties.warpSize;
-  size_t max_blocksize      = 0;
-  size_t highest_occupancy  = 0;
-
-  for(size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
-  {
-    size_t occupancy = blocksize * cuda_launch_config_detail::max_active_blocks_per_multiprocessor(properties, attributes, blocksize, block_size_to_dynamic_smem_size(blocksize));
-
-    if(occupancy > highest_occupancy)
-    {
-      max_blocksize = blocksize;
-      highest_occupancy = occupancy;
-    }
-
-    // early out, can't do better
-    if(highest_occupancy == max_occupancy)
-      break;
-  }
-
-  return max_blocksize;
-}
-
-
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties)
-{
-  return block_size_with_maximum_potential_occupancy(attributes, properties, cuda_launch_config_detail::util::zero_function<std::size_t>());
-}
-
-
-inline __host__ __device__
-size_t proportional_smem_allocation(const device_properties_t   &properties,
-                                    const function_attributes_t &attributes,
-                                    size_t blocks_per_processor)
-{
-  size_t smem_per_processor    = properties.sharedMemPerBlock;
-  size_t smem_allocation_unit  = cuda_launch_config_detail::smem_allocation_unit(properties);
-
-  size_t total_smem_per_block  = cuda_launch_config_detail::util::round_z(smem_per_processor / blocks_per_processor, smem_allocation_unit);
-  size_t static_smem_per_block = attributes.sharedSizeBytes;
-  
-  return total_smem_per_block - static_smem_per_block;
-}
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-size_t max_blocksize_subject_to_smem_usage(const device_properties_t   &properties,
-                                           const function_attributes_t &attributes,
-                                           UnaryFunction blocksize_to_dynamic_smem_usage)
-{
-  size_t largest_blocksize = (thrust::min)(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
-  size_t granularity = properties.warpSize;
-  
-  for(int blocksize = largest_blocksize; blocksize > 0; blocksize -= granularity)
-  {
-    size_t total_smem_usage = blocksize_to_dynamic_smem_usage(blocksize) + attributes.sharedSizeBytes;
-
-    if(total_smem_usage <= properties.sharedMemPerBlock)
-    {
-      return blocksize;
-    }
-  }
-
-  return 0;
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp
deleted file mode 100644
index ecdff761f..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launcher.hpp
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/alignment.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_task.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/synchronize.hpp>
-#include <thrust/detail/minmax.h>
-#include <thrust/pair.h>
-
-
-// It's not possible to launch a CUDA kernel unless __BULK_HAS_CUDART__
-// is 1, so we'd like to just hide all this code when that macro is 0.
-// Unfortunately, we can't actually modulate kernel launches based on that macro
-// because that will hide __global__ function template instantiations from critical
-// nvcc compilation phases. This means that nvcc won't actually place the kernel in the
-// binary and we'll get an undefined __global__ function error at runtime.
-// So we allow the user to unconditionally create instances of classes like cuda_launcher
-// even though the member function .launch(...) isn't always available.
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-// XXX instead of passing block_size_ as a template parameter to cuda_launcher_base,
-//     find a way to fish it out of ExecutionGroup
-template<unsigned int block_size_, typename ExecutionGroup, typename Closure>
-struct cuda_launcher_base
-  : public triple_chevron_launcher<
-      block_size_,
-      cuda_task<ExecutionGroup,Closure>
-    >
-{
-  typedef triple_chevron_launcher<block_size_, cuda_task<ExecutionGroup,Closure> > super_t;
-  typedef typename super_t::task_type                                              task_type;
-  typedef typename ExecutionGroup::size_type                                       size_type;
-
-
-  __host__ __device__
-  cuda_launcher_base()
-    : m_device_properties(bulk::detail::device_properties())
-  {}
-
-
-  __host__ __device__
-  void launch(size_type num_blocks, size_type block_size, size_type num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
-  {
-    if(num_blocks > 0)
-    {
-      super_t::launch(num_blocks, block_size, num_dynamic_smem_bytes, stream, task);
-
-      bulk::detail::synchronize_if_enabled("bulk_kernel_by_value");
-    } // end if
-  } // end launch()
-
-
-  __host__ __device__
-  static size_type max_active_blocks_per_multiprocessor(const device_properties_t &props,
-                                                        const function_attributes_t &attr,
-                                                        size_type num_threads_per_block,
-                                                        size_type num_smem_bytes_per_block)
-  {
-    return static_cast<size_type>(bulk::detail::cuda_launch_config_detail::max_active_blocks_per_multiprocessor(props, attr, num_threads_per_block, num_smem_bytes_per_block));
-  } // end max_active_blocks_per_multiprocessor()
-
-
-  // returns
-  // 1. maximum number of additional dynamic smem bytes that would not lower the kernel's occupancy
-  // 2. kernel occupancy
-  __host__ __device__
-  static thrust::pair<size_type,size_type> dynamic_smem_occupancy_limit(const device_properties_t &props, const function_attributes_t &attr, size_type num_threads_per_block, size_type num_smem_bytes_per_block)
-  {
-    // figure out the kernel's occupancy with 0 bytes of dynamic smem
-    size_type occupancy = max_active_blocks_per_multiprocessor(props, attr, num_threads_per_block, num_smem_bytes_per_block);
-
-    // if the kernel footprint is already too large, return (0,0)
-    if(occupancy < 1) return thrust::make_pair(0,0);
-
-    return thrust::make_pair(static_cast<size_type>(bulk::detail::proportional_smem_allocation(props, attr, occupancy)), occupancy);
-  } // end smem_occupancy_limit()
-
-
-  __host__ __device__
-  size_type choose_heap_size(const device_properties_t &props, size_type group_size, size_type requested_size)
-  {
-    function_attributes_t attr = bulk::detail::function_attributes(super_t::global_function_pointer());
-
-    // if the kernel's ptx version is < 200, we return 0 because there is no heap
-    // if the user requested no heap, give him no heap
-    if(attr.ptxVersion < 20 || requested_size == 0)
-    {
-      return 0;
-    } // end if
-
-    // how much smem could we allocate without reducing occupancy?
-    size_type result = 0, occupancy = 0;
-    thrust::tie(result,occupancy) = dynamic_smem_occupancy_limit(props, attr, group_size, 0);
-
-    // let's try to increase the heap size, but only if the following are true:
-    // 1. the user asked for more heap than the default
-    // 2. there's occupancy to spare
-    if(requested_size != use_default && requested_size > result && occupancy > 1)
-    {
-      // first add in a few bytes to the request for the heap data structure
-      requested_size += 48;
-
-      // are we asking for more heap than is available at this occupancy level?
-      if(requested_size > result)
-      {
-        // the request overflows occupancy, so we might as well bump it to the next level
-        size_type next_level_result = 0, next_level_occupancy = 0;
-        thrust::tie(next_level_result, next_level_occupancy) = dynamic_smem_occupancy_limit(props, attr, group_size, requested_size);
-
-        // if we didn't completely overflow things, use this new heap size
-        // otherwise, the heap remains the default size
-        if(next_level_occupancy > 0) result = next_level_result;
-      } // end else
-    } // end i
-
-    return result;
-  } // end choose_smem_size()
-
-
-  __host__ __device__
-  size_type choose_group_size(size_type requested_size)
-  {
-    size_type result = requested_size;
-
-    if(result == use_default)
-    {
-      bulk::detail::function_attributes_t attr = bulk::detail::function_attributes(super_t::global_function_pointer());
-
-      return static_cast<size_type>(bulk::detail::block_size_with_maximum_potential_occupancy(attr, device_properties()));
-    } // end if
-
-    return result;
-  } // end choose_group_size()
-
-
-  __host__ __device__
-  size_type choose_subscription(size_type block_size)
-  {
-    // given no other info, this is a reasonable guess
-    return block_size > 0 ? device_properties().maxThreadsPerMultiProcessor / block_size : 0;
-  }
-
-
-  __host__ __device__
-  size_type choose_num_groups(size_type requested_num_groups, size_type group_size)
-  {
-    size_type result = requested_num_groups;
-
-    if(result == use_default)
-    {
-      // given no other info, a reasonable number of groups
-      // would simply occupy the machine as well as possible
-      size_type subscription = choose_subscription(group_size);
-
-      result = thrust::min<size_type>(subscription * device_properties().multiProcessorCount, max_physical_grid_size());
-    } // end if
-
-    return result;
-  } // end choose_num_groups()
-
-
-  __host__ __device__
-  size_type max_physical_grid_size()
-  {
-    // get the limit of the actual device
-    int actual_limit = device_properties().maxGridSize[0];
-
-    // get the limit of the PTX version of the kernel
-    int ptx_version = bulk::detail::function_attributes(super_t::global_function_pointer()).ptxVersion;
-
-    int ptx_limit = 0;
-
-    // from table 9 of the CUDA C Programming Guide
-    if(ptx_version < 30)
-    {
-      ptx_limit = 65535;
-    } // end if
-    else
-    {
-      ptx_limit = (1u << 31) - 1;
-    } // end else
-
-    return thrust::min<size_type>(actual_limit, ptx_limit);
-  } // end max_physical_grid_size()
-
-
-  __host__ __device__
-  const device_properties_t &device_properties() const
-  {
-    return m_device_properties;
-  }
-
-
-  device_properties_t m_device_properties;
-}; // end cuda_launcher_base
-
-
-template<typename ExecutionGroup, typename Closure> struct cuda_launcher;
-
-
-template<std::size_t gridsize, std::size_t blocksize, std::size_t grainsize, typename Closure>
-struct cuda_launcher<
-  parallel_group<
-    concurrent_group<
-      agent<grainsize>,
-      blocksize
-    >,
-    gridsize
-  >,
-  Closure
->
-  : public cuda_launcher_base<blocksize, typename cuda_grid<gridsize,blocksize,grainsize>::type,Closure>
-{
-  typedef cuda_launcher_base<blocksize, typename cuda_grid<gridsize,blocksize,grainsize>::type,Closure> super_t;
-  typedef typename super_t::size_type size_type;
-
-  typedef typename cuda_grid<gridsize,blocksize,grainsize>::type grid_type;
-  typedef typename grid_type::agent_type                         block_type;
-  typedef typename block_type::agent_type                        thread_type;
-
-  typedef typename super_t::task_type task_type;
-
-  // launch(...) requires CUDA launch capability
-  __host__ __device__
-  void launch(grid_type request, Closure c, cudaStream_t stream)
-  {
-    grid_type g = configure(request);
-
-    size_type num_blocks = g.size();
-    size_type block_size = g.this_exec.size();
-
-    if(num_blocks > 0 && block_size > 0)
-    {
-      size_type heap_size  = g.this_exec.heap_size();
-
-      size_type max_physical_grid_size = super_t::max_physical_grid_size();
-
-      // launch multiple grids in order to accomodate potentially too large grid size requests
-      // XXX these will all go in sequential order in the same stream, even though they are logically
-      //     parallel
-      if(block_size > 0)
-      {
-        size_type num_remaining_physical_blocks = num_blocks;
-        for(size_type block_offset = 0;
-            block_offset < num_blocks;
-            block_offset += max_physical_grid_size)
-        {
-          task_type task(g, c, block_offset);
-
-          size_type num_physical_blocks = thrust::min<size_type>(num_remaining_physical_blocks, max_physical_grid_size);
-
-          super_t::launch(num_physical_blocks, block_size, heap_size, stream, task);
-
-          num_remaining_physical_blocks -= num_physical_blocks;
-        } // end for block_offset
-      } // end if
-    } // end if
-  } // end go()
-
-  __host__ __device__
-  grid_type configure(grid_type g)
-  {
-    size_type block_size = super_t::choose_group_size(g.this_exec.size());
-    size_type heap_size  = super_t::choose_heap_size(device_properties(), block_size, g.this_exec.heap_size());
-    size_type num_blocks = g.size();
-
-    return make_grid<grid_type>(num_blocks, make_block<block_type>(block_size, heap_size));
-  } // end configure()
-
-  // chooses a number of groups and a group size
-  __host__ __device__
-  thrust::pair<size_type, size_type> choose_sizes(size_type requested_num_groups, size_type requested_group_size)
-  {
-    // if a static blocksize is set, we ignore the requested group size
-    // and just use the static value
-    size_type group_size = blocksize;
-    if(group_size == 0)
-    {
-      group_size = super_t::choose_group_size(requested_group_size);
-    } // end if
-
-    // if a static gridsize is set, we ignore the requested group size
-    // and just use the static value
-    size_type num_groups = gridsize;
-    if(num_groups == 0)
-    {
-      num_groups = super_t::choose_num_groups(requested_num_groups, group_size);
-    } // end if
-
-    return thrust::make_pair(num_groups, group_size);
-  } // end choose_sizes()
-}; // end cuda_launcher
-
-
-template<std::size_t blocksize, std::size_t grainsize, typename Closure>
-struct cuda_launcher<
-  concurrent_group<
-    agent<grainsize>,
-    blocksize
-  >,
-  Closure
->
-  : public cuda_launcher_base<blocksize,concurrent_group<agent<grainsize>,blocksize>,Closure>
-{
-  typedef cuda_launcher_base<blocksize,concurrent_group<agent<grainsize>,blocksize>,Closure> super_t;
-  typedef typename super_t::size_type size_type;
-  typedef typename super_t::task_type task_type;
-
-  typedef concurrent_group<agent<grainsize>,blocksize> block_type;
-
-  __host__ __device__
-  void launch(block_type request, Closure c, cudaStream_t stream)
-  {
-    block_type b = configure(request);
-
-    size_type block_size = b.size();
-    size_type heap_size  = b.heap_size();
-
-    if(block_size > 0)
-    {
-      task_type task(b, c);
-      super_t::launch(1, block_size, heap_size, stream, task);
-    } // end if
-  } // end go()
-
-  __host__ __device__
-  block_type configure(block_type b)
-  {
-    size_type block_size = super_t::choose_group_size(b.size());
-    size_type heap_size  = super_t::choose_heap_size(device_properties(), block_size, b.heap_size());
-    return make_block<block_type>(block_size, heap_size);
-  } // end configure()
-}; // end cuda_launcher
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename Closure>
-struct cuda_launcher<
-  parallel_group<
-    agent<grainsize>,
-    groupsize
-  >,
-  Closure
->
-  : public cuda_launcher_base<dynamic_group_size, parallel_group<agent<grainsize>,groupsize>,Closure>
-{
-  typedef cuda_launcher_base<dynamic_group_size, parallel_group<agent<grainsize>,groupsize>,Closure> super_t;
-  typedef typename super_t::size_type size_type; 
-  typedef typename super_t::task_type task_type;
-
-  typedef parallel_group<agent<grainsize>,groupsize> group_type;
-
-  __host__ __device__
-  void launch(group_type g, Closure c, cudaStream_t stream)
-  {
-    size_type num_blocks, block_size;
-    thrust::tie(num_blocks,block_size) = configure(g);
-
-    if(num_blocks > 0 && block_size > 0)
-    {
-      task_type task(g, c);
-
-      super_t::launch(num_blocks, block_size, 0, stream, task);
-    } // end if
-  } // end go()
-
-  __host__ __device__
-  thrust::tuple<size_type,size_type> configure(group_type g)
-  {
-    size_type block_size = thrust::min<size_type>(g.size(), super_t::choose_group_size(use_default));
-
-    // don't ask for more than a reasonable number of blocks
-    size_type max_blocks = super_t::choose_num_groups(bulk::use_default, block_size);
-
-    // given no limits at all, how many blocks would we launch?
-    size_type num_blocks = (block_size > 0) ? (g.size() + block_size - 1) / block_size : 0;
-
-    // don't ask for more blocks than the limit we prescribed for ourself
-    num_blocks = thrust::min<size_type>(num_blocks, max_blocks);
-
-    return thrust::make_tuple(num_blocks, block_size);
-  } // end configure()
-}; // end cuda_launcher
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp
deleted file mode 100644
index 37b372c20..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/terminate.hpp>
-#include <thrust/detail/swap.h>
-#include <cstring>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-// this thing has ownership semantics like unique_ptr, so copy and assign are more like moves
-template<typename T>
-class parameter_ptr
-{
-  public:
-    typedef T element_type;
-
-    __host__ __device__
-    explicit parameter_ptr(element_type *ptr)
-      : m_ptr(ptr)
-    {}
-
-    // XXX copy emulates a move
-    __host__ __device__
-    parameter_ptr(const parameter_ptr& other_)
-    {
-      parameter_ptr& other = const_cast<parameter_ptr&>(other_);
-      thrust::swap(m_ptr, other.m_ptr);
-    }
-
-    __host__ __device__
-    ~parameter_ptr()
-    {
-#if __BULK_HAS_CUDART__
-      if(m_ptr)
-      {
-        bulk::detail::terminate_on_error(cudaFree(m_ptr), "in parameter_ptr dtor");
-      }
-#else
-      bulk::detail::terminate_with_message("parameter_ptr dtor: cudaFree requires CUDART");
-#endif
-    }
-
-    // XXX assign emulates a move
-    __host__ __device__
-    parameter_ptr& operator=(const parameter_ptr& other_)
-    {
-      parameter_ptr& other = const_cast<parameter_ptr&>(other_);
-      thrust::swap(m_ptr, other.m_ptr);
-      return *this;
-    }
-
-    __host__ __device__
-    T* get() const
-    {
-      return m_ptr;
-    }
-
-  private:
-    T *m_ptr;
-};
-
-
-template<typename T>
-__host__ __device__
-parameter_ptr<T> make_parameter(const T& x)
-{
-  T* raw_ptr = 0;
-
-  // allocate
-#if __BULK_HAS_CUDART__
-  bulk::detail::throw_on_error(cudaMalloc(&raw_ptr, sizeof(T)), "make_parameter(): after cudaMalloc");
-#else
-  bulk::detail::terminate_with_message("make_parameter(): cudaMalloc requires CUDART\n");
-#endif
-
-  // do a trivial copy
-#ifndef __CUDA_ARCH__
-  bulk::detail::throw_on_error(cudaMemcpy(raw_ptr, &x, sizeof(T), cudaMemcpyHostToDevice),
-                               "make_parameter(): after cudaMemcpy");
-#else
-  std::memcpy(raw_ptr, &x, sizeof(T));
-#endif
-
-  return parameter_ptr<T>(raw_ptr);
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp
deleted file mode 100644
index bed1cbf11..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-
-// #include this for device_properties_t and function_attributes_t
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/cuda_launch_config.hpp>
-
-// #include this for size_t
-#include <cstddef>
-
-
-// runtime introspection isn't possible without CUDART
-#if __BULK_HAS_CUDART__
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-/*! Returns the current device ordinal.
- */
-__host__ __device__
-inline int current_device();
-
-/*! Returns a copy of the device_properties_t structure
- *  that is associated with a given device.
- */
-__host__ __device__
-inline device_properties_t device_properties(int device_id);
-
-/*! Returns a copy of the device_properties_t structure
- *  that is associated with the current device.
- */
-__host__ __device__
-inline device_properties_t device_properties();
-
-/*! Returns a copy of the function_attributes_t structure
- *  that is associated with a given __global__ function
- */
-template <typename KernelFunction>
-__host__ __device__
-inline function_attributes_t function_attributes(KernelFunction kernel);
-
-/*! Returns the compute capability of a device in integer format.
- *  For example, returns 10 for sm_10 and 21 for sm_21
- *  \return The compute capability as an integer
- */
-__host__ __device__
-inline size_t compute_capability(const device_properties_t &properties);
-
-__host__ __device__
-inline size_t compute_capability();
-
-
-} // end namespace detail
-} // end namespace bulk
-BULK_NAMESPACE_SUFFIX
-
-
-#endif // __BULK_HAS_CUDART__
-
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl>
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl
deleted file mode 100644
index 93f52ab28..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.inl
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/system_error.h>
-#include <thrust/system/cuda/error.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-__host__ __device__
-inline device_properties_t device_properties_uncached(int device_id)
-{
-  device_properties_t prop = {0,{0,0,0},0,0,0,0,0,0,0};
-
-  cudaError_t error = cudaErrorNoDevice;
-
-#if __BULK_HAS_CUDART__
-  error = cudaDeviceGetAttribute(&prop.major,           cudaDevAttrComputeCapabilityMajor,      device_id);
-  error = cudaDeviceGetAttribute(&prop.maxGridSize[0],              cudaDevAttrMaxGridDimX,                 device_id);
-  error = cudaDeviceGetAttribute(&prop.maxGridSize[1],              cudaDevAttrMaxGridDimY,                 device_id);
-  error = cudaDeviceGetAttribute(&prop.maxGridSize[2],              cudaDevAttrMaxGridDimZ,                 device_id);
-  error = cudaDeviceGetAttribute(&prop.maxThreadsPerBlock,          cudaDevAttrMaxThreadsPerBlock,          device_id);
-  error = cudaDeviceGetAttribute(&prop.maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
-  error = cudaDeviceGetAttribute(&prop.minor,                       cudaDevAttrComputeCapabilityMinor,      device_id);
-  error = cudaDeviceGetAttribute(&prop.multiProcessorCount,         cudaDevAttrMultiProcessorCount,         device_id);
-  error = cudaDeviceGetAttribute(&prop.regsPerBlock,                cudaDevAttrMaxRegistersPerBlock,        device_id);
-  int temp;
-  error = cudaDeviceGetAttribute(&temp,                             cudaDevAttrMaxSharedMemoryPerBlock,     device_id);
-  prop.sharedMemPerBlock = temp;
-  error = cudaDeviceGetAttribute(&prop.warpSize,                    cudaDevAttrWarpSize,                    device_id);
-#else
-  (void) device_id; // Suppress unused parameter warnings
-#endif
-
-  throw_on_error(error, "cudaDeviceGetProperty in get_device_properties");
-
-  return prop;
-}
-
-
-inline device_properties_t device_properties_cached(int device_id)
-{
-  // cache the result of get_device_properties, because it is slow
-  // only cache the first few devices
-  static const int max_num_devices                              = 16;
-
-  static bool properties_exist[max_num_devices]                 = {0};
-  static device_properties_t device_properties[max_num_devices] = {};
-
-  if(device_id >= max_num_devices)
-  {
-    return device_properties_uncached(device_id);
-  }
-
-  if(!properties_exist[device_id])
-  {
-    device_properties[device_id] = device_properties_uncached(device_id);
-
-    // disallow the compiler to move the write to properties_exist[device_id]
-    // before the initialization of device_properties[device_id]
-    __thrust_compiler_fence();
-    
-    properties_exist[device_id] = true;
-  }
-
-  return device_properties[device_id];
-}
-
-
-__host__ __device__
-inline device_properties_t device_properties(int device_id)
-{
-#ifndef __CUDA_ARCH__
-  return device_properties_cached(device_id);
-#else
-  return device_properties_uncached(device_id);
-#endif
-}
-
-
-__host__ __device__
-inline int current_device()
-{
-  int result = -1;
-
-#if __BULK_HAS_CUDART__
-  bulk::detail::throw_on_error(cudaGetDevice(&result), "current_device(): after cudaGetDevice");
-#endif
-
-  if(result < 0)
-  {
-    bulk::detail::throw_on_error(cudaErrorNoDevice, "current_device(): after cudaGetDevice"); 
-  }
-
-  return result;
-}
-
-
-__host__ __device__
-inline device_properties_t device_properties()
-{
-  return device_properties(current_device());
-}
-
-
-template <typename KernelFunction>
-__host__ __device__
-inline function_attributes_t function_attributes(KernelFunction kernel)
-{
-#if __BULK_HAS_CUDART__
-  typedef void (*fun_ptr_type)();
-
-  fun_ptr_type fun_ptr = reinterpret_cast<fun_ptr_type>(kernel);
-
-  cudaFuncAttributes attributes;
-  
-  bulk::detail::throw_on_error(cudaFuncGetAttributes(&attributes, fun_ptr), "function_attributes(): after cudaFuncGetAttributes");
-
-  // be careful about how this is initialized!
-  function_attributes_t result = {
-    attributes.constSizeBytes,
-    attributes.localSizeBytes,
-    attributes.maxThreadsPerBlock,
-    attributes.numRegs,
-    attributes.ptxVersion,
-    attributes.sharedSizeBytes
-  };
-
-  return result;
-#else
-  return function_attributes_t();
-#endif // __CUDACC__
-}
-
-__host__ __device__
-inline size_t compute_capability(const device_properties_t &properties)
-{
-  return 10 * properties.major + properties.minor;
-}
-
-
-__host__ __device__
-inline size_t compute_capability()
-{
-  return compute_capability(device_properties());
-}
-
-
-} // end namespace detail
-} // end namespace bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp
deleted file mode 100644
index 5c72a5693..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_launcher/triple_chevron_launcher.hpp
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/alignment.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/parameter_ptr.hpp>
-
-// It's not possible to launch a CUDA kernel unless __BULK_HAS_CUDART__
-// is 1, so we'd like to just hide all this code when that macro is 0.
-// Unfortunately, we can't actually modulate kernel launches based on that macro
-// because that will hide __global__ function template instantiations from critical
-// nvcc compilation phases. This means that nvcc won't actually place the kernel in the
-// binary and we'll get an undefined __global__ function error at runtime.
-// So we allow the user to unconditionally call cuda_launcher.launch() even though it
-// will terminate the program at runtime if CUDART is not available.
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-#ifdef __CUDACC__
-// if there are multiple versions of Bulk floating around, this may be #defined already
-#  ifndef __bulk_launch_bounds__
-#    define __bulk_launch_bounds__(num_threads_per_block, num_blocks_per_sm) __launch_bounds__(num_threads_per_block, num_blocks_per_sm)
-#  endif
-#else
-#  ifndef __bulk_launch_bounds__
-#    define __bulk_launch_bounds__(num_threads_per_block, num_blocks_per_sm)
-#  endif
-#endif // __CUDACC__
-
-
-// triple_chevron_launcher_base is the base class of triple_chevron_launcher
-// it primarily serves to choose (statically) which __global__ function is used as the kernel
-// sm_20+ devices have 4096 bytes of parameter space
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
-template<unsigned int block_size, typename Function, bool by_value = (sizeof(Function) <= 4096)> struct triple_chevron_launcher_base;
-
-
-template<unsigned int block_size, typename Function>
-__global__
-__bulk_launch_bounds__(block_size, 0)
-void launch_by_value(Function f)
-{
-  f();
-}
-
-
-template<unsigned int block_size, typename Function>
-struct triple_chevron_launcher_base<block_size,Function,true>
-{
-  typedef void (*global_function_pointer_t)(Function);
-
-  __host__ __device__
-  static global_function_pointer_t global_function_pointer()
-  {
-    return launch_by_value<block_size,Function>;
-  }
-};
-
-
-template<unsigned int block_size, typename Function>
-__global__
-__bulk_launch_bounds__(block_size, 0)
-void launch_by_pointer(const Function *f)
-{
-  // copy to registers
-  Function f_reg = *f;
-  f_reg();
-}
-
-
-template<unsigned int block_size, typename Function>
-struct triple_chevron_launcher_base<block_size,Function,false>
-{
-  typedef void (*global_function_pointer_t)(const Function*);
-
-  __host__ __device__
-  static global_function_pointer_t global_function_pointer()
-  {
-    return launch_by_pointer<block_size,Function>;
-  }
-};
-
-
-// sm_20+ devices have 4096 bytes of parameter space
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
-template<unsigned int block_size_, typename Function, bool by_value = sizeof(Function) <= 4096>
-class triple_chevron_launcher : protected triple_chevron_launcher_base<block_size_, Function>
-{
-  private:
-    typedef triple_chevron_launcher_base<block_size_,Function> super_t;
-
-  public:
-    typedef Function task_type;
-
-    inline __host__ __device__
-    void launch(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
-    {
-      struct workaround
-      {
-        __host__ __device__
-        static void supported_path(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
-        {
-#if __BULK_HAS_CUDART__
-#  ifndef __CUDA_ARCH__
-          cudaConfigureCall(dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream);
-          cudaSetupArgument(task, 0);
-          bulk::detail::throw_on_error(cudaLaunch(super_t::global_function_pointer()), "after cudaLaunch in triple_chevron_launcher::launch()");
-#  else
-          void *param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value, sizeof(task_type));
-          std::memcpy(param_buffer, &task, sizeof(task_type));
-          bulk::detail::throw_on_error(cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()), param_buffer, dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream),
-                                       "after cudaLaunchDevice in triple_chevron_launcher::launch()");
-#  endif // __CUDA_ARCH__
-#endif // __BULK_HAS_CUDART__
-        }
-
-        __host__ __device__
-        static void unsupported_path(unsigned int, unsigned int, size_t, cudaStream_t, task_type)
-        {
-          bulk::detail::terminate_with_message("triple_chevron_launcher::launch(): CUDA kernel launch requires CUDART.");
-        }
-      };
-
-#if __BULK_HAS_CUDART__
-      workaround::supported_path(num_blocks, block_size, num_dynamic_smem_bytes, stream, task);
-#else
-      workaround::unsupported_path(num_blocks, block_size, num_dynamic_smem_bytes, stream, task);
-#endif
-    } // end launch()
-};
-
-
-// sm_20+ devices have 4096 bytes of parameter space
-// http://docs.nvidia.com/cuda/cuda-c-programming-guide/#function-parameters
-// This specialization of triple_chevron_launcher marshals large Functions through
-// global memory via parameter_ptr
-template<unsigned int block_size_, typename Function>
-class triple_chevron_launcher<block_size_,Function,false> : protected triple_chevron_launcher_base<block_size_,Function>
-{
-  private:
-    typedef triple_chevron_launcher_base<block_size_,Function> super_t;
-
-  public:
-    typedef Function task_type;
-
-    inline __host__ __device__
-    void launch(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
-    {
-      struct workaround
-      {
-        __host__ __device__
-        static void supported_path(unsigned int num_blocks, unsigned int block_size, size_t num_dynamic_smem_bytes, cudaStream_t stream, task_type task)
-        {
-          bulk::detail::parameter_ptr<task_type> parm = bulk::detail::make_parameter<task_type>(task);
-
-#if __BULK_HAS_CUDART__
-#  ifndef __CUDA_ARCH__
-          cudaConfigureCall(dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream);
-          cudaSetupArgument(static_cast<const task_type*>(parm.get()), 0);
-          bulk::detail::throw_on_error(cudaLaunch(super_t::global_function_pointer()), "after cudaLaunch in triple_chevron_launcher::launch()");
-#  else
-          void *param_buffer = cudaGetParameterBuffer(alignment_of<task_type>::value, sizeof(task_type));
-          task_type *task_ptr = parm.get();
-          std::memcpy(param_buffer, &task_ptr, sizeof(task_type*));
-          bulk::detail::throw_on_error(cudaLaunchDevice(reinterpret_cast<void*>(super_t::global_function_pointer()), param_buffer, dim3(num_blocks), dim3(block_size), num_dynamic_smem_bytes, stream),
-                                       "after cudaLaunchDevice in triple_chevron_launcher::launch()");
-#  endif // __CUDA_ARCH__
-#endif // __BULK_HAS_CUDART__
-        }
-
-        __host__ __device__
-        static void unsupported_path(unsigned int, unsigned int, size_t, cudaStream_t, task_type)
-        {
-          bulk::detail::terminate_with_message("triple_chevron_launcher::launch(): CUDA kernel launch requires CUDART.");
-        }
-      };
-
-#if __BULK_HAS_CUDART__
-      workaround::supported_path(num_blocks, block_size, num_dynamic_smem_bytes, stream, task);
-#else
-      workaround::unsupported_path(num_blocks, block_size, num_dynamic_smem_bytes, stream, task);
-#endif
-    } // end launch()
-};
-
-
-} // end detail
-} // end bul
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp b/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp
deleted file mode 100644
index 9e195aa79..000000000
--- a/thrust/system/cuda/detail/bulk/detail/cuda_task.hpp
+++ /dev/null
@@ -1,368 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/malloc.hpp>
-#include <thrust/system/cuda/detail/bulk/execution_policy.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/closure.hpp>
-
-#include <thrust/detail/type_traits.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename ExecutionGroup, typename Closure>
-class task_base
-{
-  public:
-    typedef ExecutionGroup group_type;
-    typedef Closure        closure_type;
-
-    __host__ __device__
-    task_base(group_type g, closure_type c)
-      : c(c), g(g)
-    {}
-
-  protected:
-    __host__ __device__
-    static void substitute_placeholders_and_execute(group_type &g, closure_type &c)
-    {
-      // substitute placeholders with this_group
-      substituted_arguments_type new_args = substitute_placeholders(g, c.arguments());
-
-      // create a new closure with the new arguments
-      closure<typename closure_type::function_type, substituted_arguments_type> new_c(c.function(), new_args);
-
-      // execute the new closure
-      new_c();
-    }
-
-    closure_type c;
-    group_type g;
-
-  private:
-    template<typename T>
-    struct substitutor_result
-      : thrust::detail::eval_if<
-          bulk::detail::is_cursor<T>::value,
-          cursor_result<T,ExecutionGroup>,
-          thrust::detail::identity_<T>
-        >
-    {};
-
-    typedef typename bulk::detail::tuple_meta_transform<
-      typename closure_type::arguments_type,
-      substitutor_result
-    >::type substituted_arguments_type;
-
-    struct substitutor
-    {
-      group_type &g;
-
-      __device__
-      substitutor(group_type &g)
-        : g(g)
-      {}
-
-      template<unsigned int depth>
-      __device__
-      typename bulk::detail::cursor_result<cursor<depth>,group_type>::type
-      operator()(cursor<depth> c) const
-      {
-        return c.get(g);
-      }
-
-      template<typename T>
-      __device__
-      T &operator()(T &x) const
-      {
-        return x;
-      }
-    };
-
-    __host__ __device__
-    static substituted_arguments_type substitute_placeholders(group_type &g, typename closure_type::arguments_type args)
-    {
-      return bulk::detail::tuple_host_device_transform<substitutor_result>(args, substitutor(g));
-    }
-};
-
-
-template<std::size_t blocksize, std::size_t grainsize>
-struct cuda_block
-{
-  typedef concurrent_group<agent<grainsize>, blocksize> type;
-};
-
-
-template<std::size_t gridsize, std::size_t blocksize, std::size_t grainsize>
-struct cuda_grid
-{
-  typedef parallel_group<
-    typename cuda_block<blocksize,grainsize>::type
-  > type;
-};
-
-
-template<typename Group, typename Closure> class cuda_task;
-
-
-template<typename Grid>
-struct grid_maker
-{
-  __host__ __device__
-  static Grid make(typename Grid::size_type     size,
-                   typename Grid::agent_type    block,
-                   typename Grid::size_type     index)
-  {
-    return Grid(block, index);
-  }
-};
-
-
-template<typename Block>
-struct grid_maker<parallel_group<Block,dynamic_group_size> >
-{
-  __host__ __device__
-  static parallel_group<Block,dynamic_group_size> make(typename parallel_group<Block,dynamic_group_size>::size_type size,
-                                                       Block block,
-                                                       typename parallel_group<Block,dynamic_group_size>::size_type index)
-  {
-    return parallel_group<Block,dynamic_group_size>(size, block, index);
-  }
-};
-
-
-template<typename Block>
-struct block_maker
-{
-  __host__ __device__
-  static Block make(typename Block::size_type     size,
-                    typename Block::size_type     heap_size,
-                    typename Block::agent_type    thread,
-                    typename Block::size_type     index)
-  {
-    return Block(heap_size, thread, index);
-  }
-};
-
-template<typename Thread>
-struct block_maker<concurrent_group<Thread,dynamic_group_size> >
-{
-  __host__ __device__
-  static concurrent_group<Thread,dynamic_group_size> make(typename concurrent_group<Thread,dynamic_group_size>::size_type size,
-                                                          typename concurrent_group<Thread,dynamic_group_size>::size_type heap_size,
-                                                          Thread thread,
-                                                          typename concurrent_group<Thread,dynamic_group_size>::size_type index)
-  {
-    return concurrent_group<Thread,dynamic_group_size>(size, heap_size, thread, index);
-  }
-};
-
-
-template<typename Grid>
-__host__ __device__
-Grid make_grid(typename Grid::size_type size, typename Grid::agent_type block, typename Grid::size_type index = invalid_index)
-{
-  return grid_maker<Grid>::make(size, block, index);
-}
-
-
-template<typename Block>
-__host__ __device__
-Block make_block(typename Block::size_type size, typename Block::size_type heap_size, typename Block::agent_type thread = typename Block::agent_type(), typename Block::size_type index = invalid_index)
-{
-  return block_maker<Block>::make(size, heap_size, thread, index);
-}
-
-
-// specialize cuda_task for a CUDA grid
-template<std::size_t gridsize, std::size_t blocksize, std::size_t grainsize, typename Closure>
-class cuda_task<
-  parallel_group<
-    concurrent_group<
-      agent<grainsize>,
-      blocksize
-    >,
-    gridsize
-  >,
-  Closure
-> : public task_base<typename cuda_grid<gridsize,blocksize,grainsize>::type,Closure>
-{
-  private:
-    typedef task_base<typename cuda_grid<gridsize,blocksize,grainsize>::type,Closure> super_t;
-
-  public:
-    typedef typename super_t::group_type    grid_type;
-    typedef typename grid_type::agent_type  block_type;
-    typedef typename block_type::agent_type thread_type;
-    typedef typename super_t::closure_type  closure_type;
-    typedef typename grid_type::size_type   size_type;
-
-  private:
-    size_type block_offset;
-
-  public:
-
-    __host__ __device__
-    cuda_task(grid_type g, closure_type c, size_type offset)
-      : super_t(g,c),
-        block_offset(offset)
-    {}
-
-    __device__
-    void operator()()
-    {
-      // guard use of CUDA built-ins from foreign compilers
-#ifdef __CUDA_ARCH__
-      // instantiate a view of this grid
-      grid_type this_grid =
-        make_grid<grid_type>(
-          super_t::g.size(),
-          make_block<block_type>(
-            blockDim.x,
-            super_t::g.this_exec.heap_size(),
-            thread_type(threadIdx.x),
-            block_offset + blockIdx.x
-          ),
-          0
-      );
-
-#if __CUDA_ARCH__ >= 200
-      // initialize shared storage
-      if(this_grid.this_exec.this_exec.index() == 0)
-      {
-        bulk::detail::init_on_chip_malloc(this_grid.this_exec.heap_size());
-      }
-      this_grid.this_exec.wait();
-#endif
-
-      substitute_placeholders_and_execute(this_grid, super_t::c);
-#endif
-    } // end operator()
-}; // end cuda_task
-
-
-// specialize cuda_task for a single CUDA block
-template<std::size_t blocksize, std::size_t grainsize, typename Closure>
-class cuda_task<
-  concurrent_group<
-    agent<grainsize>,
-    blocksize
-  >,
-  Closure
-> : public task_base<typename cuda_block<blocksize,grainsize>::type,Closure>
-{
-  private:
-    typedef task_base<typename cuda_block<blocksize,grainsize>::type,Closure> super_t;
-
-  public:
-    typedef typename super_t::group_type    block_type;
-    typedef typename block_type::agent_type thread_type;
-    typedef typename super_t::closure_type  closure_type;
-    typedef typename block_type::size_type  size_type;
-
-  public:
-    __host__ __device__
-    cuda_task(block_type b, closure_type c)
-      : super_t(b,c)
-    {}
-
-    __device__
-    void operator()()
-    {
-      // guard use of CUDA built-ins from foreign compilers
-#ifdef __CUDA_ARCH__
-      // instantiate a view of this block
-      block_type this_block =
-        make_block<block_type>(
-          blockDim.x,
-          super_t::g.heap_size(),
-          thread_type(threadIdx.x),
-          0
-        );
-
-#if __CUDA_ARCH__ >= 200
-      // initialize shared storage
-      if(this_block.this_exec.index() == 0)
-      {
-        bulk::detail::init_on_chip_malloc(this_block.heap_size());
-      }
-      this_block.wait();
-#endif
-
-      substitute_placeholders_and_execute(this_block, super_t::c);
-#endif
-    } // end operator()
-}; // end cuda_task
-
-
-// specialize cuda_task for a single big parallel group
-template<std::size_t groupsize, std::size_t grainsize, typename Closure>
-class cuda_task<parallel_group<agent<grainsize>,groupsize>,Closure>
-  : public task_base<parallel_group<agent<grainsize>,groupsize>,Closure>
-{
-  private:
-    typedef task_base<parallel_group<agent<grainsize>,groupsize>,Closure> super_t;
-
-  public:
-    typedef typename super_t::closure_type closure_type;
-    typedef typename super_t::group_type   group_type;
-
-    __host__ __device__
-    cuda_task(group_type g, closure_type c)
-      : super_t(g,c)
-    {}
-
-    __device__
-    void operator()()
-    {
-      // guard use of CUDA built-ins from foreign compilers
-#ifdef __CUDA_ARCH__
-      typedef int size_type;
-
-      const size_type grid_size = gridDim.x * blockDim.x;
-
-      for(size_type tid = blockDim.x * blockIdx.x + threadIdx.x;
-          tid < super_t::g.size();
-          tid += grid_size)
-      {
-        // instantiate a view of the exec group
-        parallel_group<agent<grainsize>,groupsize> this_group(
-          1,
-          agent<grainsize>(tid),
-          0
-        );
-
-        substitute_placeholders_and_execute(this_group, super_t::c);
-      } // end for
-#endif
-    } // end operator()
-}; // end cuda_task
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp b/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp
deleted file mode 100644
index 85c94b8b3..000000000
--- a/thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-
-// the purpose of this header is to #include <cuda_runtime_api> without causing
-// warnings from redefinitions of __host__ and __device__.
-// we only do this if host_defines.h has not been included yet
-// we carefully save the definitions of __host__ & __device__ and restore them
-// if the compiler does not have push_macro & pop_macro, just undef __host__ & __device__ and hope for the best
-
-// can't tell exactly when push_macro & pop_macro were introduced to gcc; assume 4.5.0
-#if !defined(__HOST_DEFINES_H__)
-#  if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500) || defined(__clang__)
-#    ifdef __host__
-#      pragma push_macro("__host__")
-#      undef __host__
-#      define BULK_HOST_NEEDS_RESTORATION
-#    endif
-#    ifdef __device__
-#      pragma push_macro("__device__")
-#      undef __device__
-#      define BULK_DEVICE_NEEDS_RESTORATION
-#    endif
-#  else // GNUC pre 4.5.0
-#    ifdef __host__
-#      undef __host__
-#    endif
-#    ifdef __device__
-#      undef __device__
-#    endif
-#  endif // has push/pop_macro
-#endif // __HOST_DEFINES_H__
-
-
-#include <cuda_runtime_api.h>
-
-
-#if !defined(__GNUC__) || ((10000 * __GNUC__ + 100 * __GNUC_MINOR__ + __GNUC_PATCHLEVEL__) >= 40500) || defined(__clang__)
-#  ifdef BULK_HOST_NEEDS_RESTORATION
-#    pragma pop_macro("__host__")
-#    undef BULK_HOST_NEEDS_RESTORATION
-#  endif
-#  ifdef BULK_DEVICE_NEEDS_RESTORATION
-#    pragma pop_macro("__device__")
-#    undef BULK_DEVICE_NEEDS_RESTORATION
-#  endif
-#endif // __GNUC__
-
diff --git a/thrust/system/cuda/detail/bulk/detail/head_flags.hpp b/thrust/system/cuda/detail/bulk/detail/head_flags.hpp
deleted file mode 100644
index e35a3ea63..000000000
--- a/thrust/system/cuda/detail/bulk/detail/head_flags.hpp
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/tuple.h>
-#include <thrust/functional.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename RandomAccessIterator,
-         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
-         typename ValueType = bool,
-         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
-  class head_flags_with_init
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type init_type;
-
-  // XXX WAR cudafe issue
-  //private:
-  public:
-    struct head_flag_functor
-    {
-      BinaryPredicate binary_pred; // this must be the first member for performance reasons
-      init_type init;
-      IndexType n;
-
-      typedef ValueType result_type;
-
-      __host__ __device__
-      head_flag_functor(init_type init, IndexType n)
-        : binary_pred(), init(init), n(n)
-      {}
-
-      __host__ __device__
-      head_flag_functor(init_type init, IndexType n, BinaryPredicate binary_pred)
-        : binary_pred(binary_pred), init(init), n(n)
-      {}
-
-      template<typename Tuple>
-      __host__ __device__ __thrust_forceinline__
-      result_type operator()(const Tuple &t)
-      {
-        const IndexType i = thrust::get<0>(t);
-
-        if(i == 0)
-        {
-          return !binary_pred(init, thrust::get<1>(t));
-        }
-
-        return !binary_pred(thrust::get<1>(t), thrust::get<2>(t));
-      }
-    };
-
-    typedef thrust::counting_iterator<IndexType> counting_iterator;
-
-  public:
-    typedef thrust::transform_iterator<
-      head_flag_functor,
-      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
-    > iterator;
-
-    __bulk_hd_warning_disable__
-    __host__ __device__
-    head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init)
-      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
-                                                head_flag_functor(init, last - first))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __host__ __device__
-    head_flags_with_init(RandomAccessIterator first, RandomAccessIterator last, init_type init, BinaryPredicate binary_pred)
-      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
-                                                head_flag_functor(init, last - first, binary_pred))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __host__ __device__
-    iterator begin() const
-    {
-      return m_begin;
-    }
-
-    __host__ __device__
-    iterator end() const
-    {
-      return m_end;
-    }
-
-    template<typename OtherIndex>
-    __host__ __device__
-    typename iterator::reference operator[](OtherIndex i)
-    {
-      return *(begin() + i);
-    }
-
-  private:
-    iterator m_begin, m_end;
-};
-
-
-
-template<typename RandomAccessIterator,
-         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
-         typename ValueType = bool,
-         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
-//  class head_flags
-  class head_flags_
-{
-  // XXX WAR cudafe issue
-  //private:
-  public:
-    struct head_flag_functor
-    {
-      BinaryPredicate binary_pred; // this must be the first member for performance reasons
-      IndexType n;
-
-      typedef ValueType result_type;
-
-      __host__ __device__
-      head_flag_functor(IndexType n)
-        : binary_pred(), n(n)
-      {}
-
-      __host__ __device__
-      head_flag_functor(IndexType n, BinaryPredicate binary_pred)
-        : binary_pred(binary_pred), n(n)
-      {}
-
-      template<typename Tuple>
-      __host__ __device__ __thrust_forceinline__
-      result_type operator()(const Tuple &t)
-      {
-        const IndexType i = thrust::get<0>(t);
-
-        // note that we do not dereference the tuple's 2nd element when i <= 0
-        // and therefore do not dereference a bad location at the boundary
-        return (i == 0 || !binary_pred(thrust::get<1>(t), thrust::get<2>(t)));
-      }
-    };
-
-    typedef thrust::counting_iterator<IndexType> counting_iterator;
-
-  public:
-    typedef thrust::transform_iterator<
-      head_flag_functor,
-      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
-    > iterator;
-
-    __host__ __device__
-    //head_flags(RandomAccessIterator first, RandomAccessIterator last)
-    head_flags_(RandomAccessIterator first, RandomAccessIterator last)
-      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
-                                                head_flag_functor(last - first))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __host__ __device__
-    //head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-    head_flags_(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-      : m_begin(thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
-                                                head_flag_functor(last - first, binary_pred))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __host__ __device__
-    iterator begin() const
-    {
-      return m_begin;
-    }
-
-    __host__ __device__
-    iterator end() const
-    {
-      return m_end;
-    }
-
-    template<typename OtherIndex>
-    __host__ __device__
-    typename iterator::reference operator[](OtherIndex i)
-    {
-      return *(begin() + i);
-    }
-
-  private:
-    iterator m_begin, m_end;
-};
-
-
-template<typename RandomAccessIterator, typename BinaryPredicate>
-__host__ __device__
-//head_flags_<RandomAccessIterator, BinaryPredicate>
-head_flags_<RandomAccessIterator, BinaryPredicate>
-  make_head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-{
-  //return head_flags<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
-  return head_flags_<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
-}
-
-
-template<typename RandomAccessIterator>
-__host__ __device__
-//head_flags<RandomAccessIterator>
-head_flags_<RandomAccessIterator>
-  make_head_flags(RandomAccessIterator first, RandomAccessIterator last)
-{
-  //return head_flags<RandomAccessIterator>(first, last);
-  return head_flags_<RandomAccessIterator>(first, last);
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp b/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp
deleted file mode 100644
index 54a3bc01c..000000000
--- a/thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-inline __device__ unsigned int __isShared(const void *ptr)
-{
-  // XXX WAR unused variable warning
-  (void) ptr;
-
-  unsigned int ret;
-
-#if __CUDA_ARCH__ >= 200
-  asm volatile ("{ \n\t"
-                "    .reg .pred p; \n\t"
-                "    isspacep.shared p, %1; \n\t"
-                "    selp.u32 %0, 1, 0, p;  \n\t"
-#  if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__)
-                "} \n\t" : "=r"(ret) : "l"(ptr));
-#  else
-                "} \n\t" : "=r"(ret) : "r"(ptr));
-#  endif
-#else
-  ret = 0;
-#endif
-
-  return ret;
-} // end __isShared()
-
-
-inline __device__ bool is_shared(const void *ptr)
-{
-  return __isShared(ptr);
-} // end is_shared()
-
-
-inline __device__ bool is_global(const void *ptr)
-{
-  // XXX WAR unused variable warning
-  (void) ptr;
-
-#if __CUDA_ARCH__ >= 200
-  return __isGlobal(ptr);
-#else
-  return false;
-#endif
-} // end is_global()
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/synchronize.hpp b/thrust/system/cuda/detail/bulk/detail/synchronize.hpp
deleted file mode 100644
index f8c38f7bc..000000000
--- a/thrust/system/cuda/detail/bulk/detail/synchronize.hpp
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/terminate.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-inline __host__ __device__
-void synchronize(const char* message = "")
-{
-#if __BULK_HAS_CUDART__
-  bulk::detail::throw_on_error(cudaDeviceSynchronize(), message);
-#else
-  bulk::detail::terminate_with_message("cudaDeviceSynchronize() requires CUDART");
-  (void)message; // Avoid unused parameter warnings
-#endif
-} // end terminate()
-
-
-inline __host__ __device__
-void synchronize_if_enabled(const char* message = "")
-{
-// XXX we rely on __THRUST_SYNCHRONOUS here
-//     note we always have to synchronize in __device__ code
-#if __THRUST_SYNCHRONOUS || defined(__CUDA_ARCH__)
-  synchronize(message);
-#else
-  // WAR "unused parameter" warning
-  (void) message;
-#endif
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp b/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp
deleted file mode 100644
index 6a21204bc..000000000
--- a/thrust/system/cuda/detail/bulk/detail/tail_flags.hpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/tuple.h>
-#include <thrust/functional.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename RandomAccessIterator,
-         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
-         typename ValueType = bool,
-         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
-  class tail_flags_
-{
-  // XXX WAR cudafe bug
-  //private:
-  public:
-    struct tail_flag_functor
-    {
-      BinaryPredicate binary_pred; // this must be the first member for performance reasons
-      RandomAccessIterator iter;
-      IndexType n;
-
-      typedef ValueType result_type;
-
-      __host__ __device__
-      tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last)
-        : binary_pred(), iter(first), n(last - first)
-      {}
-
-      __host__ __device__
-      tail_flag_functor(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-        : binary_pred(binary_pred), iter(first), n(last - first)
-      {}
-
-      __host__ __device__ __thrust_forceinline__
-      result_type operator()(const IndexType &i)
-      {
-        return (i == (n - 1) || !binary_pred(iter[i], iter[i+1]));
-      }
-    };
-
-    typedef thrust::counting_iterator<IndexType> counting_iterator;
-
-  public:
-    typedef thrust::transform_iterator<
-      tail_flag_functor,
-      counting_iterator
-    > iterator;
-
-    __thrust_exec_check_disable__
-    __host__ __device__
-    tail_flags_(RandomAccessIterator first, RandomAccessIterator last)
-      : m_begin(thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0),
-                                                tail_flag_functor(first, last))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __thrust_exec_check_disable__
-    __host__ __device__
-    tail_flags_(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-      : m_begin(thrust::make_transform_iterator(thrust::counting_iterator<IndexType>(0),
-                                                tail_flag_functor(first, last, binary_pred))),
-        m_end(m_begin + (last - first))
-    {}
-
-    __host__ __device__
-    iterator begin() const
-    {
-      return m_begin;
-    }
-
-    __host__ __device__
-    iterator end() const
-    {
-      return m_end;
-    }
-
-    template<typename OtherIndex>
-    __host__ __device__
-    typename iterator::reference operator[](OtherIndex i)
-    {
-      return *(begin() + i);
-    }
-
-  private:
-    iterator m_begin, m_end;
-};
-
-
-template<typename RandomAccessIterator, typename BinaryPredicate>
-__host__ __device__
-//tail_flags<RandomAccessIterator, BinaryPredicate>
-tail_flags_<RandomAccessIterator, BinaryPredicate>
-  make_tail_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
-{
-//  return tail_flags<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
-  return tail_flags_<RandomAccessIterator, BinaryPredicate>(first, last, binary_pred);
-}
-
-
-template<typename RandomAccessIterator>
-__host__ __device__
-//tail_flags<RandomAccessIterator>
-tail_flags_<RandomAccessIterator>
-  make_tail_flags(RandomAccessIterator first, RandomAccessIterator last)
-{
-//  return tail_flags<RandomAccessIterator>(first, last);
-  return tail_flags_<RandomAccessIterator>(first, last);
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/terminate.hpp b/thrust/system/cuda/detail/bulk/detail/terminate.hpp
deleted file mode 100644
index 33b6578b7..000000000
--- a/thrust/system/cuda/detail/bulk/detail/terminate.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <cstdio>
-#include <exception>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-__host__ __device__
-inline void terminate()
-{
-#ifdef __CUDA_ARCH__
-  asm("trap;");
-#else
-  std::terminate();
-#endif
-} // end terminate()
-
-
-__host__ __device__
-inline void terminate_with_message(const char* message)
-{
-#if __BULK_HAS_PRINTF__
-  std::printf("%s\n", message);
-#endif
-
-  bulk::detail::terminate();
-}
-
-
-__host__ __device__
-inline void terminate_on_error(cudaError_t e, const char* message)
-{
-  if(e)
-  {
-#if (__BULK_HAS_PRINTF__ && __BULK_HAS_CUDART__)
-    printf("Error after: %s: %s\n", message, cudaGetErrorString(e));
-#elif __BULK_HAS_PRINTF__
-    printf("Error: %s\n", message);
-#endif
-    bulk::detail::terminate();
-  }
-}
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp b/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp
deleted file mode 100644
index 56649d775..000000000
--- a/thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/terminate.hpp>
-#include <thrust/system_error.h>
-#include <thrust/system/cuda/error.h>
-#include <cstdio>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-inline __host__ __device__
-void throw_on_error(cudaError_t e, const char *message)
-{
-  if(e)
-  {
-#ifndef __CUDA_ARCH__
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-#else
-#  if (__BULK_HAS_PRINTF__ && __BULK_HAS_CUDART__)
-    printf("Error after %s: %s\n", message, cudaGetErrorString(e));
-#  elif __BULK_HAS_PRINTF__
-    printf("Error: %s\n", message);
-#  endif
-    bulk::detail::terminate();
-#endif
-  } // end if
-} // end throw_on_error()
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp b/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp
deleted file mode 100644
index df83c5d9f..000000000
--- a/thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/tuple.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
-  struct tuple_meta_transform;
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,0>
-{
-  typedef thrust::tuple<> type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,1>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,2>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,3>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,4>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,5>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,6>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,7>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,8>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,9>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type
-  > type;
-};
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction>
-  struct tuple_meta_transform<Tuple,UnaryMetaFunction,10>
-{
-  typedef thrust::tuple<
-    typename UnaryMetaFunction<typename thrust::tuple_element<0,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<1,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<2,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<3,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<4,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<5,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<6,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<7,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<8,Tuple>::type>::type,
-    typename UnaryMetaFunction<typename thrust::tuple_element<9,Tuple>::type>::type
-  > type;
-};
-
-
-} // end detail
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp b/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp
deleted file mode 100644
index b2ad50ee8..000000000
--- a/thrust/system/cuda/detail/bulk/detail/tuple_transform.hpp
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/tuple_meta_transform.hpp>
-#include <thrust/tuple.h>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction,
-         unsigned int sz = thrust::tuple_size<Tuple>::value>
-  struct tuple_transform_functor;
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,0>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    return thrust::tuple<>();
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    return thrust::tuple<>();
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,1>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,2>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,3>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,4>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,5>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,6>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,7>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,8>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,9>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)));
-  }
-};
-
-
-template<typename Tuple,
-         template<typename> class UnaryMetaFunction,
-         typename UnaryFunction>
-  struct tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction,10>
-{
-  static __host__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
-  }
-
-  static __host__ __device__
-  typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-  do_it_on_the_host_or_device(const Tuple &t, UnaryFunction f)
-  {
-    typedef typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type XfrmTuple;
-
-    return XfrmTuple(f(thrust::get<0>(t)),
-                     f(thrust::get<1>(t)),
-                     f(thrust::get<2>(t)),
-                     f(thrust::get<3>(t)),
-                     f(thrust::get<4>(t)),
-                     f(thrust::get<5>(t)),
-                     f(thrust::get<6>(t)),
-                     f(thrust::get<7>(t)),
-                     f(thrust::get<8>(t)),
-                     f(thrust::get<9>(t)));
-  }
-};
-
-
-template<template<typename> class UnaryMetaFunction,
-         typename Tuple,
-         typename UnaryFunction>
-typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-tuple_host_transform(const Tuple &t, UnaryFunction f)
-{
-  return tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction>::do_it_on_the_host(t,f);
-}
-
-template<template<typename> class UnaryMetaFunction,
-         typename Tuple,
-         typename UnaryFunction>
-typename tuple_meta_transform<Tuple,UnaryMetaFunction>::type
-__host__ __device__
-tuple_host_device_transform(const Tuple &t, UnaryFunction f)
-{
-  return tuple_transform_functor<Tuple,UnaryMetaFunction,UnaryFunction>::do_it_on_the_host_or_device(t,f);
-}
-
-} // end detail
-} // end thrust
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/execution_policy.hpp b/thrust/system/cuda/detail/bulk/execution_policy.hpp
deleted file mode 100644
index af6e708cd..000000000
--- a/thrust/system/cuda/detail/bulk/execution_policy.hpp
+++ /dev/null
@@ -1,680 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/future.hpp>
-#include <thrust/detail/type_traits.h>
-#include <thrust/system/cuda/detail/bulk/detail/cuda_launcher/runtime_introspection.hpp>
-#include <cstddef>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-// ExecutionAgent requirements:
-//
-// template<typename T>
-// concept bool ExecutionAgent()
-// {
-//   return requires(T t)
-//   {
-//     typename T::size_type;
-//     {t.index()} -> typename T::size_type;
-//   }
-// };
-//
-// ExecutionGroup requirements:
-//
-// template<typename T>
-// concept bool ExecutionGroup()
-// {
-//   return ExecutionAgent<T>
-//       && requires(T g)
-//   {
-//     typename T::agent_type;
-//     ExecutionAgent<typename T::agent_type>();
-//     {g.size()} -> typename T::size_type;
-//     {g.this_exec} -> typename T::agent_type &
-//   }
-// };
-
-
-static const int invalid_index = INT_MAX;
-
-
-// sequential execution with a grainsize hint and index within a group
-// a light-weight (logical) thread
-template<std::size_t grainsize_ = 1>
-class agent
-{
-  public:
-    typedef int size_type;
-
-    static const size_type static_grainsize = grainsize_;
-
-    __host__ __device__
-    agent(size_type i = invalid_index)
-      : m_index(i)
-    {}
-
-    __host__ __device__
-    size_type index() const
-    {
-      return m_index;
-    }
-
-    __host__ __device__
-    size_type grainsize() const
-    {
-      return static_grainsize;
-    }
-
-  private:
-    const size_type m_index;
-};
-
-
-static const int use_default = INT_MAX;
-
-static const int dynamic_group_size = 0;
-
-
-namespace detail
-{
-namespace group_detail
-{
-
-
-template<typename ExecutionAgent, std::size_t size_>
-class group_base
-{
-  public:
-    typedef ExecutionAgent agent_type;
-
-    typedef int size_type;
-
-    static const size_type static_size = size_;
-
-    __host__ __device__
-    group_base(agent_type exec = agent_type(), size_type i = invalid_index)
-      : this_exec(exec),
-        m_index(i)
-    {}
-
-    __host__ __device__
-    size_type index() const
-    {
-      return m_index;
-    }
-
-    __host__ __device__
-    size_type size() const
-    {
-      return static_size;
-    }
-
-    __device__
-    size_type global_index() const
-    {
-      return index() * size() + this_exec.index();
-    }
-
-    agent_type this_exec;
-
-  private:
-    const size_type m_index;
-};
-
-
-template<typename ExecutionAgent>
-class group_base<ExecutionAgent,dynamic_group_size>
-{
-  public:
-    typedef ExecutionAgent agent_type;
-
-    typedef int size_type;
-
-    __host__ __device__
-    group_base(size_type sz, agent_type exec = agent_type(), size_type i = invalid_index)
-      : this_exec(exec),
-        m_size(sz),
-        m_index(i)
-    {}
-
-    __host__ __device__
-    size_type index() const
-    {
-      return m_index;
-    }
-
-    __host__ __device__
-    size_type size() const
-    {
-      return m_size;
-    }
-
-    __host__ __device__
-    size_type global_index() const
-    {
-      return index() * size() + this_exec.index();
-    }
-
-    agent_type this_exec;
-
-  private:
-    const size_type m_size;
-    const size_type m_index;
-};
-
-
-} // end group_detail
-} // end detail
-
-
-// a group of independent ExecutionAgents
-template<typename ExecutionAgent = agent<>,
-         std::size_t size_ = dynamic_group_size>
-class parallel_group
-  : public detail::group_detail::group_base<ExecutionAgent,size_>
-{
-  private:
-    typedef detail::group_detail::group_base<
-      ExecutionAgent,
-      size_
-    > super_t;
-
-  public:
-    typedef typename super_t::agent_type agent_type;
-
-    typedef typename super_t::size_type  size_type;
-
-    // XXX the constructor taking an index should be made private
-    __host__ __device__
-    parallel_group(agent_type exec = agent_type(), size_type i = invalid_index)
-      : super_t(exec,i)
-    {}
-};
-
-
-template<typename ExecutionAgent>
-class parallel_group<ExecutionAgent,dynamic_group_size>
-  : public detail::group_detail::group_base<ExecutionAgent,dynamic_group_size>
-{
-  private:
-    typedef detail::group_detail::group_base<
-      ExecutionAgent,
-      dynamic_group_size
-    > super_t;
-
-  public:
-    typedef typename super_t::agent_type agent_type;
-
-    typedef typename super_t::size_type  size_type;
-
-    // XXX the constructor taking an index should be made private
-    __host__ __device__
-    parallel_group(size_type size, agent_type exec = agent_type(), size_type i = invalid_index)
-      : super_t(size,exec,i)
-    {}
-};
-
-
-// shorthand for creating a parallel_group of agents
-inline __host__ __device__
-parallel_group<> par(size_t size)
-{
-  typedef parallel_group<>::size_type size_type;
-  return parallel_group<>(static_cast<size_type>(size));
-}
-
-
-// shorthand for creating a parallel_group of ExecutionAgents
-template<typename ExecutionAgent>
-__host__ __device__
-parallel_group<ExecutionAgent> par(ExecutionAgent exec, size_t size)
-{
-  typedef typename parallel_group<ExecutionAgent>::size_type size_type;
-  return parallel_group<ExecutionAgent>(static_cast<size_type>(size), exec);
-}
-
-
-template<typename ExecutionAgent>
-class async_launch
-{
-  public:
-    __host__ __device__
-    async_launch(ExecutionAgent exec, cudaStream_t s, cudaEvent_t be = 0)
-      : stream_valid(true),e(exec),s(s),be(be)
-    {}
-
-    __host__
-    async_launch(ExecutionAgent exec, cudaEvent_t be)
-      : stream_valid(false),e(exec),s(0),be(be)
-    {}
-
-    __host__ __device__
-    ExecutionAgent exec() const
-    {
-      return e;
-    }
-
-    __host__ __device__
-    cudaStream_t stream() const
-    {
-      return s;
-    }
-
-    __host__ __device__
-    cudaEvent_t before_event() const
-    {
-      return be;
-    }
-
-    __host__ __device__
-    bool is_stream_valid() const
-    {
-      return stream_valid;
-    }
-
-  private:
-    bool stream_valid;
-    ExecutionAgent e;
-    cudaStream_t s;
-    cudaEvent_t be;
-};
-
-
-inline __host__ __device__
-async_launch<bulk::parallel_group<> > par(cudaStream_t s, size_t num_threads)
-{
-  typedef bulk::parallel_group<>::size_type size_type;
-  return async_launch<bulk::parallel_group<> >(bulk::parallel_group<>(static_cast<size_type>(num_threads)), s);
-}
-
-
-template<typename ExecutionAgent>
-inline __host__ __device__
-async_launch<bulk::parallel_group<ExecutionAgent> > par(cudaStream_t s, ExecutionAgent exec, size_t num_groups)
-{
-  return async_launch<bulk::parallel_group<ExecutionAgent> >(bulk::par(exec, num_groups), s);
-}
-
-
-inline async_launch<bulk::parallel_group<> > par(bulk::future<void> &before, size_t num_threads)
-{
-  cudaEvent_t before_event = bulk::detail::future_core_access::event(before);
-
-  typedef bulk::parallel_group<>::size_type size_type;
-  return async_launch<bulk::parallel_group<> >(bulk::parallel_group<>(static_cast<size_type>(num_threads)), before_event);
-}
-
-
-// a group of concurrent ExecutionAgents which may synchronize
-template<typename ExecutionAgent      = agent<>,
-         std::size_t size_      = dynamic_group_size>
-class concurrent_group
-  : public parallel_group<ExecutionAgent,size_>
-{
-  private:
-    typedef parallel_group<
-      ExecutionAgent,
-      size_
-    > super_t;
-
-  public:
-    typedef typename super_t::agent_type agent_type;
-    typedef typename super_t::size_type  size_type;
-
-    // XXX the constructor taking an index should be made private
-    __host__ __device__
-    concurrent_group(size_type heap_size = use_default,
-                     agent_type exec = agent_type(),
-                     size_type i = invalid_index)
-      : super_t(exec,i),
-        m_heap_size(heap_size)
-    {}
-
-    __device__
-    void wait() const
-    {
-      // guard use of __syncthreads from foreign compilers
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-    }
-
-    __host__ __device__
-    size_type heap_size() const
-    {
-      return m_heap_size;
-    }
-
-    // XXX this should go elsewhere
-    __host__ __device__
-    inline static size_type hardware_concurrency()
-    {
-#if __BULK_HAS_CUDART__
-      return static_cast<size_type>(bulk::detail::device_properties().multiProcessorCount);
-#else
-      return 0;
-#endif
-    } // end hardware_concurrency()
-
-  private:
-    size_type m_heap_size;
-};
-
-
-template<typename ExecutionAgent>
-class concurrent_group<ExecutionAgent,dynamic_group_size>
-  : public parallel_group<ExecutionAgent,dynamic_group_size>
-{
-  private:
-    typedef parallel_group<
-      ExecutionAgent,
-      dynamic_group_size
-    > super_t;
-
-  public:
-    typedef typename super_t::agent_type agent_type;
-
-    typedef typename super_t::size_type  size_type;
-
-    // XXX the constructor taking an index should be made private
-    __host__ __device__
-    concurrent_group(size_type size,
-                     size_type heap_size = use_default,
-                     agent_type exec = agent_type(),
-                     size_type i = invalid_index)
-      : super_t(size,exec,i),
-        m_heap_size(heap_size)
-    {}
-
-    __device__
-    void wait()
-    {
-      // guard use of __syncthreads from foreign compilers
-#ifdef __CUDA_ARCH__
-      __syncthreads();
-#endif
-    }
-
-    __host__ __device__
-    size_type heap_size() const
-    {
-      return m_heap_size;
-    }
-
-    // XXX this should go elsewhere
-    __host__ __device__
-    inline static size_type hardware_concurrency()
-    {
-#if __BULK_HAS_CUDART__
-      return static_cast<size_type>(bulk::detail::device_properties().multiProcessorCount);
-#else
-      return 0;
-#endif
-    } // end hardware_concurrency()
-
-  private:
-    size_type m_heap_size;
-};
-
-
-// shorthand for creating a concurrent_group of agents
-inline __host__ __device__
-concurrent_group<> con(size_t size, size_t heap_size = use_default)
-{
-  typedef concurrent_group<>::size_type size_type;
-  return concurrent_group<>(static_cast<size_type>(size),static_cast<size_type>(heap_size));
-}
-
-
-// shorthand for creating a concurrent_group of ExecutionAgents
-template<typename ExecutionAgent>
-__host__ __device__
-concurrent_group<ExecutionAgent> con(ExecutionAgent exec, size_t size, size_t heap_size = use_default)
-{
-  typedef typename concurrent_group<ExecutionAgent>::size_type size_type;
-  return concurrent_group<ExecutionAgent>(static_cast<size_type>(size),static_cast<size_type>(heap_size),exec);
-}
-
-
-// shorthand for creating a concurrent_group of agents with static sizing
-template<std::size_t groupsize, std::size_t grainsize>
-__host__ __device__
-concurrent_group<bulk::agent<grainsize>,groupsize>
-con(size_t heap_size)
-{
-  typedef typename concurrent_group<bulk::agent<grainsize>,groupsize>::size_type size_type;
-  return concurrent_group<bulk::agent<grainsize>,groupsize>(static_cast<size_type>(heap_size));
-}
-
-
-// a way to statically bound the size of an ExecutionAgent's work
-template<std::size_t bound_, typename ExecutionAgent>
-class bounded
-  : public ExecutionAgent
-{
-  public:
-    typedef typename ExecutionAgent::size_type size_type;
-
-    static const size_type static_bound = bound_;
-
-    __host__ __device__
-    size_type bound() const
-    {
-      return static_bound;
-    }
-
-
-    __host__ __device__
-    ExecutionAgent &unbound()
-    {
-      return *this;
-    }
-
-
-    __host__ __device__
-    const ExecutionAgent &unbound() const
-    {
-      return *this;
-    }
-
-
-  private:
-    // XXX delete these unless we find a need for them
-    bounded();
-
-    bounded(const bounded &);
-};
-
-
-template<std::size_t bound_, typename ExecutionAgent>
-__host__ __device__
-bounded<bound_, ExecutionAgent> &bound(ExecutionAgent &exec)
-{
-  return static_cast<bounded<bound_, ExecutionAgent>&>(exec);
-}
-
-
-template<std::size_t bound_, typename ExecutionAgent>
-__host__ __device__
-const bounded<bound_, ExecutionAgent> &bound(const ExecutionAgent &exec)
-{
-  return static_cast<const bounded<bound_, ExecutionAgent>&>(exec);
-}
-
-
-namespace detail
-{
-
-
-template<unsigned int depth, typename ExecutionAgent>
-struct agent_at_depth
-{
-  typedef typename agent_at_depth<
-    depth-1,ExecutionAgent
-  >::type parent_agent_type;
-
-  typedef typename parent_agent_type::agent_type type;
-};
-
-
-template<typename ExecutionAgent>
-struct agent_at_depth<0,ExecutionAgent>
-{
-  typedef ExecutionAgent type;
-};
-
-
-template<typename Cursor, typename ExecutionGroup>
-struct cursor_result
-{
-  typedef typename agent_at_depth<Cursor::depth,ExecutionGroup>::type & type;
-};
-
-
-template<unsigned int d> struct cursor;
-
-
-template<unsigned int d>
-struct cursor
-{
-  static const unsigned int depth = d;
-
-  __host__ __device__ cursor() {}
-
-  cursor<depth+1> this_exec;
-
-  template<typename ExecutionGroup>
-  static __host__ __device__
-  typename cursor_result<cursor,ExecutionGroup>::type
-  get(ExecutionGroup &root)
-  {
-    return cursor<depth-1>::get(root.this_exec);
-  }
-};
-
-
-template<> struct cursor<3>
-{
-  static const unsigned int depth = 3;
-
-  __host__ __device__ cursor() {}
-
-  template<typename ExecutionGroup>
-  static __host__ __device__
-  typename cursor_result<cursor,ExecutionGroup>::type
-  get(ExecutionGroup &root)
-  {
-    return cursor<depth-1>::get(root.this_exec);
-  }
-};
-
-
-template<> struct cursor<0>
-{
-  static const unsigned int depth = 0;
-
-  __host__ __device__ cursor() {}
-
-  cursor<1> this_exec;
-
-  // the root level cursor simply returns the root
-  template<typename ExecutionAgent>
-  static __host__ __device__
-  ExecutionAgent &get(ExecutionAgent &root)
-  {
-    return root;
-  }
-};
-
-
-template<typename T> struct is_cursor : thrust::detail::false_type {};
-
-
-template<unsigned int d>
-struct is_cursor<cursor<d> >
-  : thrust::detail::true_type
-{};
-
-
-} // end detail
-
-
-#ifdef __CUDA_ARCH__
-static const __device__ detail::cursor<0> root;
-#else
-static const detail::cursor<0> root;
-#endif
-
-
-// shorthand for creating a parallel group of concurrent groups of agents
-inline __host__ __device__
-parallel_group<concurrent_group<> > grid(size_t num_groups = use_default, size_t group_size = use_default, size_t heap_size = use_default)
-{
-  return par(con(group_size,heap_size), num_groups);
-}
-               
-  
-
-
-inline __host__ __device__
-async_launch<
-  parallel_group<concurrent_group<> >
->
-  grid(size_t num_groups, size_t group_size, size_t heap_size, cudaStream_t stream)
-{
-  return par(stream, con(group_size,heap_size), num_groups);
-}
-
-
-template<std::size_t groupsize, std::size_t grainsize>
-__host__ __device__
-parallel_group<
-  concurrent_group<
-    bulk::agent<grainsize>,
-    groupsize
-  >
->
-  grid(size_t num_groups, size_t heap_size = use_default)
-{
-  return par(con<groupsize,grainsize>(heap_size), num_groups);
-}
-
-
-template<std::size_t groupsize, std::size_t grainsize>
-__host__ __device__
-async_launch<
-  parallel_group<
-    concurrent_group<
-      bulk::agent<grainsize>,
-      groupsize
-    >
-  >
->
-  grid(size_t num_groups, size_t heap_size, cudaStream_t stream)
-{
-  return par(stream, con<groupsize,grainsize>(heap_size), num_groups);
-}
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/future.hpp b/thrust/system/cuda/detail/bulk/future.hpp
deleted file mode 100644
index 0a017e4c4..000000000
--- a/thrust/system/cuda/detail/bulk/future.hpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/guarded_cuda_runtime_api.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/throw_on_error.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/terminate.hpp>
-#include <thrust/detail/swap.h>
-#include <utility>
-#include <stdexcept>
-#include <iostream>
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-namespace detail
-{
-
-
-struct future_core_access;
-
-
-} // end detail
-
-
-template<typename T> class future;
-
-
-template<>
-class future<void>
-{
-  public:
-    __host__ __device__
-    ~future()
-    {
-      if(valid())
-      {
-#if __BULK_HAS_CUDART__
-        // swallow errors
-        cudaError_t e = cudaEventDestroy(m_event);
-
-#if __BULK_HAS_PRINTF__
-        if(e)
-        {
-          printf("CUDA error after cudaEventDestroy in future dtor: %s", cudaGetErrorString(e));
-        } // end if
-#endif // __BULK_HAS_PRINTF__
-
-        if(m_owns_stream)
-        {
-          e = cudaStreamDestroy(m_stream);
-
-#if __BULK_HAS_PRINTF__
-          if(e)
-          {
-            printf("CUDA error after cudaStreamDestroy in future dtor: %s", cudaGetErrorString(e));
-          } // end if
-#endif // __BULK_HAS_PRINTF__
-        } // end if
-#endif
-      } // end if
-    } // end ~future()
-
-    __host__ __device__
-    void wait() const
-    {
-      // XXX should probably check for valid() here
-
-#if __BULK_HAS_CUDART__
-
-#ifndef __CUDA_ARCH__
-      // XXX need to capture the error as an exception and then throw it in .get()
-      bulk::detail::throw_on_error(cudaEventSynchronize(m_event), "cudaEventSynchronize in future::wait");
-#else
-      // XXX need to capture the error as an exception and then throw it in .get()
-      bulk::detail::throw_on_error(cudaDeviceSynchronize(), "cudaDeviceSynchronize in future::wait");
-#endif // __CUDA_ARCH__
-
-#else
-      // XXX should terminate with a message
-      bulk::detail::terminate();
-#endif // __BULK_HAS_CUDART__
-    } // end wait()
-
-    __host__ __device__
-    bool valid() const
-    {
-      return m_event != 0;
-    } // end valid()
-
-    __host__ __device__
-    future()
-      : m_stream(0), m_event(0), m_owns_stream(false)
-    {}
-
-    // simulate a move
-    // XXX need to add rval_ref or something
-    __host__ __device__
-    future(const future &other)
-      : m_stream(0), m_event(0), m_owns_stream(false)
-    {
-      thrust::swap(m_stream,      const_cast<future&>(other).m_stream);
-      thrust::swap(m_event,       const_cast<future&>(other).m_event);
-      thrust::swap(m_owns_stream, const_cast<future&>(other).m_owns_stream);
-    } // end future()
-
-    // simulate a move
-    // XXX need to add rval_ref or something
-    __host__ __device__
-    future &operator=(const future &other)
-    {
-      thrust::swap(m_stream,      const_cast<future&>(other).m_stream);
-      thrust::swap(m_event,       const_cast<future&>(other).m_event);
-      thrust::swap(m_owns_stream, const_cast<future&>(other).m_owns_stream);
-      return *this;
-    } // end operator=()
-
-  private:
-    friend struct detail::future_core_access;
-
-    __host__ __device__
-    future(cudaStream_t s, bool owns_stream)
-      : m_stream(s),m_owns_stream(owns_stream)
-    {
-#if __BULK_HAS_CUDART__
-      bulk::detail::throw_on_error(cudaEventCreateWithFlags(&m_event, create_flags), "cudaEventCreateWithFlags in future ctor");
-      bulk::detail::throw_on_error(cudaEventRecord(m_event, m_stream), "cudaEventRecord in future ctor");
-#endif
-    } // end future()
-
-    // XXX this combination makes the constructor expensive
-    //static const int create_flags = cudaEventDisableTiming | cudaEventBlockingSync;
-    static const int create_flags = cudaEventDisableTiming;
-
-    cudaStream_t m_stream;
-    cudaEvent_t m_event;
-    bool m_owns_stream;
-}; // end future<void>
-
-
-namespace detail
-{
-
-
-struct future_core_access
-{
-  __host__ __device__
-  inline static future<void> create(cudaStream_t s, bool owns_stream)
-  {
-    return future<void>(s, owns_stream);
-  } // end create_in_stream()
-
-  __host__ __device__
-  inline static cudaEvent_t event(const future<void> &f)
-  {
-    return f.m_event;
-  } // end event()
-}; // end future_core_access
-
-
-} // end detail
-
-
-} // end namespace bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp b/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp
deleted file mode 100644
index 0bb7af92b..000000000
--- a/thrust/system/cuda/detail/bulk/iterator/strided_iterator.hpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/iterator/iterator_adaptor.h>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename Iterator,
-         typename Size = typename thrust::iterator_difference<Iterator>::type>
-class strided_iterator
-  : public thrust::iterator_adaptor<
-      strided_iterator<Iterator>,
-      Iterator
-    >
-{
-  private:
-    typedef thrust::iterator_adaptor<strided_iterator<Iterator>,Iterator> super_t;
-
-  public:
-    typedef Size stride_type;
-
-    inline __host__ __device__
-    strided_iterator()
-      : super_t(), m_stride(1)
-    {}
-
-    inline __host__ __device__
-    strided_iterator(const strided_iterator& other)
-      : super_t(other), m_stride(other.m_stride)
-    {}
-
-    inline __host__ __device__
-    strided_iterator(const Iterator &base, stride_type stride)
-      : super_t(base), m_stride(stride)
-    {}
-
-    inline __host__ __device__
-    stride_type stride() const
-    {
-      return m_stride;
-    }
-
-  private:
-    friend class thrust::iterator_core_access;
-
-    __host__ __device__
-    void increment()
-    {
-      super_t::base_reference() += stride();
-    }
-
-    __host__ __device__
-    void decrement()
-    {
-      super_t::base_reference() -= stride();
-    }
-
-    __host__ __device__
-    void advance(typename super_t::difference_type n)
-    {
-      super_t::base_reference() += n * stride();
-    }
-
-    template<typename OtherIterator>
-    __host__ __device__
-    typename super_t::difference_type distance_to(const strided_iterator<OtherIterator> &other) const
-    {
-      if(other.base() >= this->base())
-      {
-        return (other.base() - this->base() + (stride() - 1)) / stride();
-      }
-
-      return (other.base() - this->base() - (stride() - 1)) / stride();
-    }
-
-    stride_type m_stride;
-};
-
-
-template<typename Iterator, typename Size>
-__host__ __device__
-strided_iterator<Iterator,Size> make_strided_iterator(Iterator iter, Size stride)
-{
-  return strided_iterator<Iterator,Size>(iter, stride);
-}
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/malloc.hpp b/thrust/system/cuda/detail/bulk/malloc.hpp
deleted file mode 100644
index 3444385a5..000000000
--- a/thrust/system/cuda/detail/bulk/malloc.hpp
+++ /dev/null
@@ -1,600 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/pointer_traits.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/alignment.hpp>
-#include <thrust/system/cuda/detail/bulk/uninitialized.hpp>
-#include <thrust/detail/config.h>
-#include <cstdlib>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-inline __device__ bool is_on_chip(void *ptr)
-{
-  return bulk::detail::is_shared(ptr);
-} // end is_on_chip()
-
-
-template<typename T>
-inline __device__ T *on_chip_cast(T *ptr)
-{
-  extern __shared__ char s_begin[];
-  void *result = (reinterpret_cast<char*>(ptr) - s_begin) + s_begin;
-  return reinterpret_cast<T*>(result);
-} // end on_chip_cast()
-
-
-namespace detail
-{
-
-
-extern __shared__ int s_data_segment_begin[];
-
-
-class os
-{
-  public:
-    __device__ inline os(size_t max_data_segment_size)
-      : m_program_break(s_data_segment_begin),
-        m_max_data_segment_size(max_data_segment_size)
-    {
-    }
-
-
-    __device__ inline int brk(void *end_data_segment)
-    {
-      if(end_data_segment <= m_program_break)
-      {
-        m_program_break = end_data_segment;
-        return 0;
-      }
-
-      return -1;
-    }
-
-
-    __device__ inline void *sbrk(size_t increment)
-    {
-      if(data_segment_size() + increment <= m_max_data_segment_size)
-      {
-        m_program_break = reinterpret_cast<char*>(m_program_break) + increment;
-      } // end if
-      else
-      {
-        return reinterpret_cast<void*>(-1);
-      } // end else
-
-      return m_program_break;
-    }
-
-
-    __device__ inline void *program_break() const
-    {
-      return m_program_break;
-    }
-
-    
-    __device__ inline void *data_segment_begin() const
-    {
-      return s_data_segment_begin;
-    }
-
-
-  private:
-    __device__ inline size_t data_segment_size()
-    {
-      return reinterpret_cast<char*>(m_program_break) - reinterpret_cast<char*>(s_data_segment_begin);
-    } // end data_segment_size()
-
-
-    void *m_program_break;
-
-    // XXX this can safely be uint32
-    size_t m_max_data_segment_size;
-};
-
-
-// only one instance of this class can logically exist per CTA, and its use is thread-unsafe
-class singleton_unsafe_on_chip_allocator
-{
-  public:
-    __device__ inline singleton_unsafe_on_chip_allocator(size_t max_data_segment_size)
-      : m_os(max_data_segment_size)
-    {}
-  
-    __device__ inline void *allocate(size_t size)
-    {
-      size_t aligned_size = align8(size);
-    
-      block *prev = find_first_free_insertion_point(heap_begin(), heap_end(), aligned_size);
-    
-      block *b;
-    
-      if(prev != heap_end() && (b = prev->next()) != heap_end())
-      {
-        // can we split?
-        if((b->size() - aligned_size) >= sizeof(block))
-        {
-          split_block(b, aligned_size);
-        } // end if
-    
-        b->set_is_free(false);
-      } // end if
-      else
-      {
-        // nothing fits, extend the heap
-        b = extend_heap(prev, aligned_size);
-        if(b == heap_end())
-        {
-          return 0;
-        } // end if
-      } // end else
-    
-      return b->data();
-    } // end allocate()
-  
-  
-    __device__ inline void deallocate(void *ptr)
-    {
-      if(ptr != 0)
-      {
-        block *b = get_block(ptr);
-    
-        // free the block
-        b->set_is_free(true);
-    
-        // try to fuse the freed block the previous block
-        if(b->prev() && b->prev()->is_free())
-        {
-          b = b->prev();
-          fuse_block(b);
-        } // end if
-    
-        // now try to fuse with the next block
-        if(b->next() != heap_end())
-        {
-          fuse_block(b);
-        } // end if
-        else
-        {
-          // the the OS know where the new break is
-          m_os.brk(b);
-        } // end else
-      } // end if
-    } // end deallocate()
-
-
-  private:
-    // align to two words
-    class block : public bulk::detail::aligned_type<sizeof(size_t) + sizeof(block*)>::type
-    {
-      public:
-        __device__ inline size_t size() const
-        {
-          return m_size;
-        } // end size()
-
-        __device__ void set_size(size_t sz)
-        {
-          m_size = sz;
-        } // end set_size()
-
-        __device__ inline block *prev() const
-        {
-          return m_prev;
-        } // end prev()
-
-        __device__ void set_prev(block *p)
-        {
-          m_prev = p;
-        } // end set_prev()
-
-        // returns a pointer to the indexth byte within this block's data
-        __device__ inline void *byte_at(size_t index) const
-        {
-          return reinterpret_cast<char*>(data()) + index;
-        } // end byte_at()
-
-        __device__ inline block *next() const
-        {
-          return reinterpret_cast<block*>(byte_at(size()));
-        } // end next()
-
-        __device__ inline bool is_free() const
-        {
-          return m_is_free;
-        } // end is_free()
-
-        __device__ inline void set_is_free(bool f)
-        {
-          m_is_free = f;
-        } // end set_is_free()
-
-        __device__ inline void *data() const
-        {
-          return reinterpret_cast<char*>(const_cast<block*>(this)) + sizeof(block);
-        } // end data()
-
-      private:
-        // this packing ensures that sizeof(block) is compatible with 64b alignment, because:
-        // on a 32b platform, sizeof(block) == 64b
-        // on a 64b platform, sizeof(block) == 128b
-        bool   m_is_free : 1;
-        size_t m_size    : 8 * sizeof(size_t) - 1;
-        block *m_prev;
-    };
-  
-  
-    os     m_os;
-
-    __device__ inline block *heap_begin() const
-    {
-      return reinterpret_cast<block*>(m_os.data_segment_begin());
-    } // end heap_begin()
-
-
-    __device__ inline block *heap_end() const
-    {
-      return reinterpret_cast<block*>(m_os.program_break());
-    } // end heap_end();
-  
-  
-    __device__ inline void split_block(block *b, size_t size)
-    {
-      block *new_block;
-    
-      // emplace a new block within the old one's data segment
-      new_block = reinterpret_cast<block*>(b->byte_at(size));
-    
-      // the new block's size is the old block's size less the size of the split less the size of a block
-      new_block->set_size(b->size() - size - sizeof(block));
-    
-      new_block->set_prev(b);
-      new_block->set_is_free(true);
-    
-      // the old block's size is the size of the split
-      b->set_size(size);
-    
-      // link the old block to the new one
-      if(new_block->next() != heap_end())
-      {
-        new_block->next()->set_prev(new_block);
-      } // end if
-    } // end split_block()
-  
-  
-    __device__ inline bool fuse_block(block *b)
-    {
-      if(b->next() != heap_end() && b->next()->is_free())
-      {
-        // increment b's size by sizeof(block) plus the next's block's data size
-        b->set_size(sizeof(block) + b->next()->size() + b->size());
-    
-        if(b->next() != heap_end())
-        {
-          b->next()->set_prev(b);
-        }
-    
-        return true;
-      }
-    
-      return false;
-    } // end fuse_block()
-  
-  
-    __device__ inline static block *get_block(void *data)
-    {
-      // the block metadata lives sizeof(block) bytes to the left of data
-      void *ptr = reinterpret_cast<char*>(data) - sizeof(block);
-      return reinterpret_cast<block *>(ptr);
-    } // end get_block()
-  
-  
-    __device__ inline static block *find_first_free_insertion_point(block *first, block *last, size_t size)
-    {
-      block *prev = last;
-    
-      while(first != last && !(first->is_free() && first->size() >= size))
-      {
-        prev = first;
-        first = first->next();
-      }
-    
-      return prev;
-    } // end find_first_free_insertion_point()
-  
-  
-    __device__ inline block *extend_heap(block *prev, size_t size)
-    {
-      // the new block goes at the current end of the heap
-      block *new_block = heap_end();
-    
-      // move the break to the right to accomodate both a block and the requested allocation
-      if(m_os.sbrk(sizeof(block) + size) == reinterpret_cast<void*>(-1))
-      {
-        // allocation failed
-        return new_block;
-      }
-    
-      on_chip_cast(new_block)->set_size(size);
-      on_chip_cast(new_block)->set_prev(prev);
-      on_chip_cast(new_block)->set_is_free(false);
-    
-      return new_block;
-    } // end extend_heap()
-  
-  
-    __device__ inline static size_t align8(size_t size)
-    {
-      return ((((size - 1) >> 3) << 3) + 8);
-    } // end align4()
-}; // end singleton_unsafe_on_chip_allocator
-
-
-class singleton_on_chip_allocator
-{
-  public:
-    // XXX mark as __host__ to WAR a warning from uninitialized.construct
-    inline __device__ __host__
-    singleton_on_chip_allocator(size_t max_data_segment_size)
-      : m_mutex(),
-        m_alloc(max_data_segment_size)
-    {}
-
-
-    inline __device__
-    void *unsafe_allocate(size_t size)
-    {
-      return m_alloc.allocate(size);
-    }
-
-
-    inline __device__
-    void *allocate(size_t size)
-    {
-      void *result;
-
-      m_mutex.lock();
-      {
-        result = unsafe_allocate(size);
-      } // end critical section
-      m_mutex.unlock();
-
-      return result;
-    } // end allocate()
-
-
-    inline __device__
-    void unsafe_deallocate(void *ptr)
-    {
-      m_alloc.deallocate(ptr);
-    } // end unsafe_deallocate()
-
-
-    inline __device__
-    void deallocate(void *ptr)
-    {
-      m_mutex.lock();
-      {
-        unsafe_deallocate(ptr);
-      } // end critical section
-      m_mutex.unlock();
-    } // end deallocate()
-
-
-  private:
-    class mutex
-    {
-      public:
-        inline __device__
-        mutex()
-          : m_in_use(0)
-        {}
-
-
-        inline __device__
-        bool try_lock()
-        {
-#if __CUDA_ARCH__ >= 110
-          return atomicCAS(&m_in_use, 0, 1) != 0;
-#else
-          return false;
-#endif
-        } // end try_lock()
-
-
-        inline __device__
-        void lock()
-        {
-          // spin while waiting
-          while(try_lock())
-          {
-            ;
-          }
-        } // end lock()
-
-
-        inline __device__
-        void unlock()
-        {
-          m_in_use = 0;
-        } // end unlock()
-
-
-      private:
-        unsigned int m_in_use;
-    }; // end mutex
-
-
-    mutex m_mutex;
-    singleton_unsafe_on_chip_allocator m_alloc;
-}; // end singleton_on_chip_allocator
-
-
-// put the object in an anonymous namespace so that non-CUDA compilers don't complain about multiple definitions
-namespace
-{
-
-__shared__  uninitialized<singleton_on_chip_allocator> s_on_chip_allocator;
-
-} // end anon namespace
-
-
-inline __device__ void init_on_chip_malloc(size_t max_data_segment_size)
-{
-  s_on_chip_allocator.construct(max_data_segment_size);
-} // end init_on_chip_malloc()
-
-
-inline __device__ void *on_chip_malloc(size_t size)
-{
-  void *result = s_on_chip_allocator.get().allocate(size);
-  return on_chip_cast(result);
-} // end on_chip_malloc()
-
-
-inline __device__ void on_chip_free(void *ptr)
-{
-  s_on_chip_allocator.get().deallocate(ptr);
-} // end on_chip_free()
-
-
-inline __device__ void *unsafe_on_chip_malloc(size_t size)
-{
-  void *result = s_on_chip_allocator.get().unsafe_allocate(size);
-  return on_chip_cast(result);
-} // end unsafe_on_chip_malloc()
-
-
-inline __device__ void unsafe_on_chip_free(void *ptr)
-{
-  s_on_chip_allocator.get().unsafe_deallocate(ptr);
-} // end unsafe_on_chip_free()
-
-
-} // end detail
-
-
-inline __device__ void *shmalloc(size_t num_bytes)
-{
-  // first try on_chip_malloc
-  void *result = detail::on_chip_malloc(num_bytes);
-  
-#if __CUDA_ARCH__ >= 200
-  if(!result)
-  {
-    result = std::malloc(num_bytes);
-  } // end if
-#endif // __CUDA_ARCH__
-
-  return result;
-} // end shmalloc()
-
-
-inline __device__ void *unsafe_shmalloc(size_t num_bytes)
-{
-  // first try on_chip_malloc
-  void *result = detail::unsafe_on_chip_malloc(num_bytes);
-  
-#if __CUDA_ARCH__ >= 200
-  if(!result)
-  {
-    result = std::malloc(num_bytes);
-  } // end if
-#endif // __CUDA_ARCH__
-
-  return result;
-} // end unsafe_shmalloc()
-
-
-inline __device__ void shfree(void *ptr)
-{
-#if __CUDA_ARCH__ >= 200
-  if(bulk::is_on_chip(ptr))
-  {
-    bulk::detail::on_chip_free(bulk::on_chip_cast(ptr));
-  } // end if
-  else
-  {
-    std::free(ptr);
-  } // end else
-#else
-  bulk::detail::on_chip_free(bulk::on_chip_cast(ptr));
-#endif
-} // end shfree()
-
-
-inline __device__ void unsafe_shfree(void *ptr)
-{
-#if __CUDA_ARCH__ >= 200
-  if(bulk::is_on_chip(ptr))
-  {
-    bulk::detail::unsafe_on_chip_free(bulk::on_chip_cast(ptr));
-  } // end if
-  else
-  {
-    std::free(ptr);
-  } // end else
-#else
-  bulk::detail::unsafe_on_chip_free(bulk::on_chip_cast(ptr));
-#endif
-} // end unsafe_shfree()
-
-
-template<typename ConcurrentGroup>
-__device__
-inline void *malloc(ConcurrentGroup &g, size_t num_bytes)
-{
-  __shared__ void *s_result;
-
-  // we need to guard access to s_result from other
-  // invocations of malloc, so we put a wait at the beginning
-  g.wait();
-
-  if(g.this_exec.index() == 0)
-  {
-    s_result = bulk::unsafe_shmalloc(num_bytes);
-  } // end if
-
-  g.wait();
-
-  return s_result;
-} // end malloc()
-
-
-template<typename ConcurrentGroup>
-__device__
-inline void free(ConcurrentGroup &g, void *ptr)
-{
-  if(g.this_exec.index() == 0)
-  {
-    bulk::unsafe_shfree(ptr);
-  } // end if
-
-  g.wait();
-} // end free()
-
-
-} // end namespace bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/bulk/uninitialized.hpp b/thrust/system/cuda/detail/bulk/uninitialized.hpp
deleted file mode 100644
index 5659bdc48..000000000
--- a/thrust/system/cuda/detail/bulk/uninitialized.hpp
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/system/cuda/detail/bulk/detail/config.hpp>
-#include <thrust/system/cuda/detail/bulk/detail/alignment.hpp>
-#include <cstddef>
-#include <new>
-
-
-BULK_NAMESPACE_PREFIX
-namespace bulk
-{
-
-
-template<typename T>
-  class uninitialized
-{
-  private:
-    typename bulk::detail::aligned_storage<
-      sizeof(T),
-      bulk::detail::alignment_of<T>::value
-    >::type storage;
-
-    __host__ __device__ __thrust_forceinline__
-    const T* ptr() const
-    {
-      const void *result = storage.data;
-      return reinterpret_cast<const T*>(result);
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    T* ptr()
-    {
-      void *result = storage.data;
-      return reinterpret_cast<T*>(result);
-    }
-
-  public:
-    // copy assignment
-    __host__ __device__ __thrust_forceinline__
-    uninitialized<T> &operator=(const T &other)
-    {
-      T& self = *this;
-      self = other;
-      return *this;
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    T& get()
-    {
-      return *ptr();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    const T& get() const
-    {
-      return *ptr();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    operator T& ()
-    {
-      return get();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    operator const T&() const
-    {
-      return get();
-    }
-
-    __bulk_hd_warning_disable__
-    __host__ __device__ __thrust_forceinline__
-    void construct()
-    {
-      ::new(ptr()) T();
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg &a)
-    {
-      ::new(ptr()) T(a);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2)
-    {
-      ::new(ptr()) T(a1,a2);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3)
-    {
-      ::new(ptr()) T(a1,a2,a3);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9);
-    }
-
-    __bulk_hd_warning_disable__
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10);
-    }
-
-    __bulk_hd_warning_disable__
-    __host__ __device__ __thrust_forceinline__
-    void destroy()
-    {
-      T& self = *this;
-      self.~T();
-    }
-};
-
-
-template<typename T, std::size_t N>
-  class uninitialized_array
-{
-  public:
-    typedef T             value_type; 
-    typedef T&            reference;
-    typedef const T&      const_reference;
-    typedef T*            pointer;
-    typedef const T*      const_pointer;
-    typedef pointer       iterator;
-    typedef const_pointer const_iterator;
-    typedef std::size_t   size_type;
-
-    __thrust_forceinline__ __host__ __device__
-    iterator begin()
-    {
-      return data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator begin() const
-    {
-      return data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    iterator end()
-    {
-      return begin() + size();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator end() const
-    {
-      return begin() + size();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator cbegin() const
-    {
-      return begin();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator cend() const
-    {
-      return end();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    size_type size() const
-    {
-      return N;
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    bool empty() const
-    {
-      return false;
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    T* data()
-    {
-      return impl.get();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const T* data() const
-    {
-      return impl.get();
-    }
-
-    // element access
-    __thrust_forceinline__ __host__ __device__
-    reference operator[](size_type n)
-    {
-      return data()[n];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference operator[](size_type n) const
-    {
-      return data()[n];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    reference front()
-    {
-      return *data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference front() const
-    {
-      return *data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    reference back()
-    {
-      return data()[size() - size_type(1)];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference back() const
-    {
-      return data()[size() - size_type(1)];
-    }
-
-  private:
-    uninitialized<T[N]> impl;
-};
-
-
-} // end bulk
-BULK_NAMESPACE_SUFFIX
-
diff --git a/thrust/system/cuda/detail/cdp_dispatch.h b/thrust/system/cuda/detail/cdp_dispatch.h
new file mode 100644
index 000000000..f94e5dd92
--- /dev/null
+++ b/thrust/system/cuda/detail/cdp_dispatch.h
@@ -0,0 +1,88 @@
+/*
+*  Copyright 2021-2022 NVIDIA Corporation
+*
+*  Licensed under the Apache License, Version 2.0 (the "License");
+*  you may not use this file except in compliance with the License.
+*  You may obtain a copy of the License at
+*
+*      http://www.apache.org/licenses/LICENSE-2.0
+*
+*  Unless required by applicable law or agreed to in writing, software
+*  distributed under the License is distributed on an "AS IS" BASIS,
+*  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*  See the License for the specific language governing permissions and
+*  limitations under the License.
+*/
+
+/**
+ * \file
+ * Utilities for CUDA dynamic parallelism.
+ */
+
+#pragma once
+
+#include <cub/config.cuh>
+#include <cub/detail/detect_cuda_runtime.cuh>
+
+#include <nv/target>
+
+/**
+ * \def THRUST_CDP_DISPATCH
+ *
+ * If CUDA Dynamic Parallelism / CUDA Nested Parallelism is available, always
+ * run the parallel implementation. Otherwise, run the parallel implementation
+ * when called from the host, and fallback to the sequential implementation on
+ * the device.
+ *
+ * `par_impl` and `seq_impl` are blocks of C++ statements enclosed in
+ * parentheses, similar to NV_IF_TARGET blocks:
+ *
+ * \code
+ * THRUST_CDP_DISPATCH((launch_parallel_kernel();), (run_serial_impl();));
+ * \endcode
+ */
+
+#if defined(CUB_DETAIL_CDPv1)
+
+// Special case for NVCC -- need to inform the device path about the kernels
+// that are launched from the host path.
+#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// seq_impl only used on platforms that do not support device synchronization.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  if (false)                                                                   \
+  { /* Without this, the device pass won't compile any kernels. */             \
+    NV_IF_TARGET(NV_ANY_TARGET, par_impl);                                     \
+  }                                                                            \
+  NV_IF_TARGET(NV_PROVIDES_SM_90, seq_impl, par_impl)
+
+#else // NVCC device pass
+
+// seq_impl only used on platforms that do not support device synchronization.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_PROVIDES_SM_90, seq_impl, par_impl)
+
+#endif // NVCC device pass
+
+#else // CDPv1 unavailable. Always fallback to serial on device:
+
+// Special case for NVCC -- need to inform the device path about the kernels
+// that are launched from the host path.
+#if defined(__CUDACC__) && defined(__CUDA_ARCH__)
+
+// Device-side launch not supported, fallback to sequential in device code.
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  if (false)                                                                   \
+  { /* Without this, the device pass won't compile any kernels. */             \
+    NV_IF_TARGET(NV_ANY_TARGET, par_impl);                                     \
+  }                                                                            \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#else // !(NVCC device pass):
+
+#define THRUST_CDP_DISPATCH(par_impl, seq_impl)                                \
+  NV_IF_TARGET(NV_IS_HOST, par_impl, seq_impl)
+
+#endif // NVCC device pass
+
+#endif // CDP version
diff --git a/thrust/system/cuda/detail/copy.h b/thrust/system/cuda/detail/copy.h
index 0a4ddea83..02a5d2ac1 100644
--- a/thrust/system/cuda/detail/copy.h
+++ b/thrust/system/cuda/detail/copy.h
@@ -1,81 +1,184 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
 
-namespace thrust
+#include <thrust/advance.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/cross_system.h>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename DerivedPolicy, typename InputIt, typename OutputIt>
+__host__ __device__ OutputIt
+copy(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+     InputIt                                                     first,
+     InputIt                                                     last,
+     OutputIt                                                    result);
+
+template <class DerivedPolicy, class InputIt, class Size, class OutputIt>
+__host__ __device__ OutputIt
+copy_n(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+       InputIt                                                     first,
+       Size                                                        n,
+       OutputIt                                                    result);
+
+namespace cuda_cub {
+
+// D->D copy requires NVCC compiler
+template <class System,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy(execution_policy<System> &system,
+     InputIterator             first,
+     InputIterator             last,
+     OutputIterator            result);
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__
+copy(cross_system<System1, System2> systems,
+     InputIterator  first,
+     InputIterator  last,
+     OutputIterator result);
+
+template <class System,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy_n(execution_policy<System> &system,
+       InputIterator             first,
+       Size                      n,
+       OutputIterator            result);
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__
+copy_n(cross_system<System1, System2> systems,
+       InputIterator  first,
+       Size           n,
+       OutputIterator result);
+
+}    // namespace cuda_
+THRUST_NAMESPACE_END
+
+
+
+#include <thrust/system/cuda/detail/internal/copy_device_to_device.h>
+#include <thrust/system/cuda/detail/internal/copy_cross_system.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+// D->D copy requires NVCC compiler
+
+__thrust_exec_check_disable__
+template <class System,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy(execution_policy<System> &system,
+     InputIterator             first,
+     InputIterator             last,
+     OutputIterator            result)
 {
-namespace system
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system, first, last, result);),
+    (result =
+       thrust::copy(cvt_to_seq(derived_cast(system)), first, last, result);));
+  return result;
+}    // end copy()
+
+__thrust_exec_check_disable__
+template <class System,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__ __device__
+copy_n(execution_policy<System> &system,
+       InputIterator             first,
+       Size                      n,
+       OutputIterator            result)
 {
-namespace cuda
+  THRUST_CDP_DISPATCH(
+    (result = __copy::device_to_device(system,
+                                       first,
+                                       thrust::next(first, n),
+                                       result);),
+    (result =
+       thrust::copy_n(cvt_to_seq(derived_cast(system)), first, n, result);));
+  return result;
+} // end copy_n()
+#endif
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class OutputIterator>
+OutputIterator __host__
+copy(cross_system<System1, System2> systems,
+     InputIterator  first,
+     InputIterator  last,
+     OutputIterator result)
 {
-namespace detail
+  return __copy::cross_system_copy(systems,first,last,result);
+} // end copy()
+
+template <class System1,
+          class System2,
+          class InputIterator,
+          class Size,
+          class OutputIterator>
+OutputIterator __host__
+copy_n(cross_system<System1, System2> systems,
+       InputIterator  first,
+       Size           n,
+       OutputIterator result)
 {
+  return __copy::cross_system_copy_n(systems, first, n, result);
+} // end copy_n()
 
 
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy(execution_policy<DerivedPolicy> &exec,
-                    InputIterator first,
-                    InputIterator last,
-                    OutputIterator result);
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename OutputIterator>
-OutputIterator copy(cross_system<System1,System2> exec,
-                    InputIterator first,
-                    InputIterator last,
-                    OutputIterator result);
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
-                      InputIterator first,
-                      Size n,
-                      OutputIterator result);
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-OutputIterator copy_n(cross_system<System1,System2> exec,
-                      InputIterator first,
-                      Size n,
-                      OutputIterator result);
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/copy.inl>
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
 
+#include <thrust/memory.h>
+#include <thrust/detail/temporary_array.h>
diff --git a/thrust/system/cuda/detail/copy.inl b/thrust/system/cuda/detail/copy.inl
deleted file mode 100644
index 1969c1335..000000000
--- a/thrust/system/cuda/detail/copy.inl
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/copy.h>
-#include <thrust/system/cuda/detail/copy_device_to_device.h>
-#include <thrust/system/cuda/detail/copy_cross_system.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename System,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy(execution_policy<System> &system,
-                    InputIterator first,
-                    InputIterator last,
-                    OutputIterator result)
-{
-  return thrust::system::cuda::detail::copy_device_to_device(system,first,last,result);
-} // end copy()
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename OutputIterator>
-OutputIterator copy(cross_system<System1,System2> systems,
-                    InputIterator first,
-                    InputIterator last,
-                    OutputIterator result)
-{
-  return thrust::system::cuda::detail::copy_cross_system(systems,first,last,result);
-} // end copy()
-
-
-template<typename System,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_n(execution_policy<System> &system,
-                      InputIterator first,
-                      Size n,
-                      OutputIterator result)
-{
-  return thrust::system::cuda::detail::copy_device_to_device(system,first,first+n,result);
-} // end copy_n()
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-OutputIterator copy_n(cross_system<System1,System2> systems,
-                      InputIterator first,
-                      Size n,
-                      OutputIterator result)
-{
-  return thrust::system::cuda::detail::copy_cross_system_n(systems,first,n,result);
-} // end copy_n()
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/copy_cross_system.h b/thrust/system/cuda/detail/copy_cross_system.h
deleted file mode 100644
index a89aedd66..000000000
--- a/thrust/system/cuda/detail/copy_cross_system.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename OutputIterator>
-  OutputIterator copy_cross_system(cross_system<System1,System2> systems,
-                                   InputIterator begin, 
-                                   InputIterator end, 
-                                   OutputIterator result);
-
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-  OutputIterator copy_cross_system_n(cross_system<System1,System2> systems,
-                                     InputIterator begin, 
-                                     Size n, 
-                                     OutputIterator result);
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/copy_cross_system.inl>
-
diff --git a/thrust/system/cuda/detail/copy_cross_system.inl b/thrust/system/cuda/detail/copy_cross_system.inl
deleted file mode 100644
index 8a2396755..000000000
--- a/thrust/system/cuda/detail/copy_cross_system.inl
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/copy_cross_system.h>
-#include <thrust/detail/copy.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/dispatch/is_trivial_copy.h>
-#include <thrust/system/cuda/detail/trivial_copy.h>
-
-namespace thrust
-{
-namespace detail
-{
-
-// XXX WAR circular #inclusion problem
-template<typename,typename> class temporary_array;
-
-} // end detail
-
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-// general input to random access case
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename RandomAccessIterator>
-  RandomAccessIterator copy_cross_system(cross_system<System1,System2> systems,
-                                         InputIterator begin,
-                                         InputIterator end,
-                                         RandomAccessIterator result,
-                                         thrust::incrementable_traversal_tag, 
-                                         thrust::random_access_traversal_tag)
-{
-  //std::cerr << std::endl;
-  //std::cerr << "general copy_host_to_device(): InputIterator: " << typeid(InputIterator).name() << std::endl;
-  //std::cerr << "general copy_host_to_device(): OutputIterator: " << typeid(OutputIterator).name() << std::endl;
-
-  typedef typename thrust::iterator_value<InputIterator>::type InputType;
-
-  // allocate temporary storage in System1
-  thrust::detail::temporary_array<InputType, System1> temp(systems.system1,begin,end);
-  return thrust::copy(systems, temp.begin(), temp.end(), result);
-}
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename Size,
-         typename RandomAccessIterator>
-  RandomAccessIterator copy_cross_system_n(cross_system<System1,System2> systems,
-                                           InputIterator first,
-                                           Size n,
-                                           RandomAccessIterator result,
-                                           thrust::incrementable_traversal_tag, 
-                                           thrust::random_access_traversal_tag)
-{
-  typedef typename thrust::iterator_value<InputIterator>::type InputType;
-
-  // allocate and copy to temporary storage System1
-  thrust::detail::temporary_array<InputType, System1> temp(systems.system1, first, n);
-
-  // recurse
-  return copy_cross_system(systems, temp.begin(), temp.end(), result);
-}
-
-
-// random access to general output case
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator,
-         typename OutputIterator>
-  OutputIterator copy_cross_system(cross_system<System1,System2> systems,
-                                   RandomAccessIterator begin,
-                                   RandomAccessIterator end,
-                                   OutputIterator result,
-                                   thrust::random_access_traversal_tag, 
-                                   thrust::incrementable_traversal_tag)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type InputType;
-
-  // copy to temporary storage in System2
-  thrust::detail::temporary_array<InputType,System2> temp(systems.system2, systems.system1, begin, end);
-
-  return thrust::copy(systems.system2, temp.begin(), temp.end(), result);
-}
-
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator,
-         typename Size,
-         typename OutputIterator>
-  OutputIterator copy_cross_system_n(cross_system<System1,System2> systems,
-                                     RandomAccessIterator first,
-                                     Size n,
-                                     OutputIterator result,
-                                     thrust::random_access_traversal_tag, 
-                                     thrust::incrementable_traversal_tag)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type InputType;
-
-  // copy to temporary storage in System2
-  thrust::detail::temporary_array<InputType,System2> temp(systems.system2, systems.system1, first, n);
-
-  // copy temp to result
-  return thrust::copy(systems.system2, temp.begin(), temp.end(), result);
-}
-
-
-// trivial copy
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 copy_cross_system(cross_system<System1,System2> systems,
-                                          RandomAccessIterator1 begin,
-                                          RandomAccessIterator1 end,
-                                          RandomAccessIterator2 result,
-                                          thrust::random_access_traversal_tag,
-                                          thrust::random_access_traversal_tag,
-                                          thrust::detail::true_type) // trivial copy
-{
-//  std::cerr << std::endl;
-//  std::cerr << "random access copy_device_to_host(): trivial" << std::endl;
-//  std::cerr << "general copy_device_to_host(): RandomAccessIterator1: " << typeid(RandomAccessIterator1).name() << std::endl;
-//  std::cerr << "general copy_device_to_host(): RandomAccessIterator2: " << typeid(RandomAccessIterator2).name() << std::endl;
-  
-  // how many elements to copy?
-  typename thrust::iterator_traits<RandomAccessIterator1>::difference_type n = end - begin;
-
-  thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, result);
-
-  return result + n;
-}
-
-
-namespace detail
-{
-
-// random access non-trivial iterator to random access iterator
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system<System1,System2> systems,
-                                                                    RandomAccessIterator1 begin,
-                                                                    RandomAccessIterator1 end,
-                                                                    RandomAccessIterator2 result,
-                                                                    thrust::detail::false_type) // InputIterator is non-trivial
-{
-  // copy the input to a temporary input system buffer of OutputType
-  typedef typename thrust::iterator_value<RandomAccessIterator2>::type OutputType;
-
-  // allocate temporary storage in System1
-  thrust::detail::temporary_array<OutputType,System1> temp(systems.system1, begin, end);
-
-  // recurse
-  return copy_cross_system(systems, temp.begin(), temp.end(), result);
-}
-
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 non_trivial_random_access_copy_cross_system(cross_system<System1,System2> systems,
-                                                                    RandomAccessIterator1 begin,
-                                                                    RandomAccessIterator1 end,
-                                                                    RandomAccessIterator2 result,
-                                                                    thrust::detail::true_type) // InputIterator is trivial
-{
-  typename thrust::iterator_difference<RandomAccessIterator1>::type n = thrust::distance(begin, end);
-
-  // allocate temporary storage in System2
-  // retain the input's type for the intermediate storage
-  // do not initialize the storage (the 0 does this)
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type InputType;
-  thrust::detail::temporary_array<InputType,System2> temp(0, systems.system2, n);
-
-  // force a trivial (memcpy) copy of the input to the temporary
-  // note that this will not correctly account for copy constructors
-  // but there's nothing we can do about that
-  // XXX one thing we might try is to use pinned memory for the temporary storage
-  //     this might allow us to correctly account for copy constructors
-  thrust::system::cuda::detail::trivial_copy_n(systems, begin, n, temp.begin());
-
-  // finally, copy to the result
-  return thrust::copy(systems.system2, temp.begin(), temp.end(), result);
-}
-
-} // end detail
-
-
-// random access iterator to random access host iterator with non-trivial copy
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 copy_cross_system(cross_system<System1,System2> systems,
-                                          RandomAccessIterator1 begin,
-                                          RandomAccessIterator1 end,
-                                          RandomAccessIterator2 result,
-                                          thrust::random_access_traversal_tag,
-                                          thrust::random_access_traversal_tag,
-                                          thrust::detail::false_type) // is_trivial_copy
-{
-  // dispatch a non-trivial random access cross system copy based on whether or not the InputIterator is trivial
-  return detail::non_trivial_random_access_copy_cross_system(systems, begin, end, result,
-      typename thrust::detail::is_trivial_iterator<RandomAccessIterator1>::type());
-}
-
-// random access iterator to random access iterator
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 copy_cross_system(cross_system<System1,System2> systems,
-                                          RandomAccessIterator1 begin,
-                                          RandomAccessIterator1 end,
-                                          RandomAccessIterator2 result,
-                                          thrust::random_access_traversal_tag input_traversal,
-                                          thrust::random_access_traversal_tag output_traversal)
-{
-  // dispatch on whether this is a trivial copy
-  return copy_cross_system(systems, begin, end, result, input_traversal, output_traversal,
-          typename thrust::detail::dispatch::is_trivial_copy<RandomAccessIterator1,RandomAccessIterator2>::type());
-}
-
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-  RandomAccessIterator2 copy_cross_system_n(cross_system<System1,System2> systems,
-                                            RandomAccessIterator1 first,
-                                            Size n,
-                                            RandomAccessIterator2 result,
-                                            thrust::random_access_traversal_tag input_traversal,
-                                            thrust::random_access_traversal_tag output_traversal)
-{
-  // implement with copy_cross_system
-  return copy_cross_system(systems, first, first + n, result, input_traversal, output_traversal);
-}
-
-/////////////////
-// Entry Point //
-/////////////////
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename OutputIterator>
-  OutputIterator copy_cross_system(cross_system<System1,System2> systems,
-                                   InputIterator begin, 
-                                   InputIterator end, 
-                                   OutputIterator result)
-{
-  return copy_cross_system(systems, begin, end, result, 
-          typename thrust::iterator_traversal<InputIterator>::type(),
-          typename thrust::iterator_traversal<OutputIterator>::type());
-}
-
-template<typename System1,
-         typename System2,
-         typename InputIterator,
-         typename Size,
-         typename OutputIterator>
-  OutputIterator copy_cross_system_n(cross_system<System1,System2> systems,
-                                     InputIterator begin, 
-                                     Size n, 
-                                     OutputIterator result)
-{
-  return copy_cross_system_n(systems, begin, n, result, 
-          typename thrust::iterator_traversal<InputIterator>::type(),
-          typename thrust::iterator_traversal<OutputIterator>::type());
-}
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/copy_device_to_device.h b/thrust/system/cuda/detail/copy_device_to_device.h
deleted file mode 100644
index 2d04bc37b..000000000
--- a/thrust/system/cuda/detail/copy_device_to_device.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file copy_device_to_device.h
- *  \brief Device implementations for copying on the device.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_device_to_device(execution_policy<DerivedPolicy> &exec,
-                                     InputIterator begin, 
-                                     InputIterator end, 
-                                     OutputIterator result);
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/copy_device_to_device.inl>
-
diff --git a/thrust/system/cuda/detail/copy_device_to_device.inl b/thrust/system/cuda/detail/copy_device_to_device.inl
deleted file mode 100644
index 8bff8aff2..000000000
--- a/thrust/system/cuda/detail/copy_device_to_device.inl
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/copy_device_to_device.h>
-#include <thrust/system/cuda/detail/copy_cross_system.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/distance.h>
-#include <thrust/transform.h>
-#include <thrust/functional.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/trivial_copy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_device_to_device(execution_policy<DerivedPolicy> &exec,
-                                     InputIterator begin, 
-                                     InputIterator end, 
-                                     OutputIterator result,
-                                     thrust::detail::false_type)
-{
-    // general case (mixed types)
-    typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-    return thrust::transform(exec, begin, end, result, thrust::identity<InputType>());
-#else
-    // we're not compiling with nvcc: copy [begin, end) to temp host memory
-    typename thrust::iterator_traits<InputIterator>::difference_type n = thrust::distance(begin, end);
-
-    thrust::host_system_tag temp_exec;
-    thrust::detail::temporary_array<InputType, thrust::host_system_tag> temp1(temp_exec, begin, end);
-
-    // transform temp1 to OutputType in host memory
-    typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-
-    thrust::detail::temporary_array<OutputType, thrust::host_system_tag> temp2(temp_exec, temp1.begin(), temp1.end());
-
-    // copy temp2 to device
-    result = thrust::system::cuda::detail::copy_cross_system(temp2.begin(), temp2.end(), result);
-
-    return result;
-#endif // THRUST_DEVICE_COMPILER_NVCC
-}
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_device_to_device(execution_policy<DerivedPolicy> &exec,
-                                     InputIterator begin, 
-                                     InputIterator end, 
-                                     OutputIterator result,
-                                     thrust::detail::true_type)
-{
-  // specialization for device to device when the value_types match, operator= is not overloaded,
-  // and the iterators are pointers
-  
-  // how many elements to copy?
-  typename thrust::iterator_traits<OutputIterator>::difference_type n = end - begin;
-  
-  thrust::system::cuda::detail::trivial_copy_n(exec, begin, n, result);
-  
-  return result + n;
-}
-
-
-} // end namespace detail
-
-
-/////////////////
-// Entry Point //
-/////////////////
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator>
-__host__ __device__
-OutputIterator copy_device_to_device(execution_policy<DerivedPolicy> &exec,
-                                     InputIterator begin, 
-                                     InputIterator end, 
-                                     OutputIterator result)
-{
-  typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  
-  const bool use_trivial_copy = 
-      thrust::detail::is_same<InputType, OutputType>::value
-      && thrust::detail::is_trivial_iterator<InputIterator>::value 
-      && thrust::detail::is_trivial_iterator<OutputIterator>::value;
-  
-  // XXX WAR unused variable warning
-  (void) use_trivial_copy;
-  
-  return detail::copy_device_to_device(exec, begin, end, result,
-          thrust::detail::integral_constant<bool, use_trivial_copy>());
-}
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/copy_if.h b/thrust/system/cuda/detail/copy_if.h
index 201a9ae74..5e760c086 100644
--- a/thrust/system/cuda/detail/copy_if.h
+++ b/thrust/system/cuda/detail/copy_if.h
@@ -1,52 +1,831 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/function.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
+// XXX declare generic copy_if interface
+// to avoid circulular dependency from thrust/copy.h
+template <typename DerivedPolicy, typename InputIterator, typename OutputIterator, typename Predicate>
+__host__ __device__
+    OutputIterator
+    copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            InputIterator                                               first,
+            InputIterator                                               last,
+            OutputIterator                                              result,
+            Predicate                                                   pred);
 
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator,
-         typename Predicate>
+template <typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Predicate>
 __host__ __device__
-OutputIterator copy_if(execution_policy<DerivedPolicy> &exec,
-                       InputIterator1 first,
-                       InputIterator1 last,
-                       InputIterator2 stencil,
-                       OutputIterator result,
-                       Predicate pred);
+    OutputIterator
+    copy_if(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+            InputIterator1                                              first,
+            InputIterator1                                              last,
+            InputIterator2                                              stencil,
+            OutputIterator                                              result,
+            Predicate                                                   pred);
+
+namespace cuda_cub {
+
+namespace __copy_if {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class, class>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm52, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+
+  template<class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 10,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+  template<class T>
+  struct Tuning<sm30, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<300>
+
+  struct no_stencil_tag_    {};
+  typedef no_stencil_tag_* no_stencil_tag;
+  template <class ItemsIt,
+            class StencilIt,
+            class OutputIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutputIt>
+  struct CopyIfAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type   item_type;
+    typedef typename iterator_traits<StencilIt>::value_type stencil_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch,item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type   ItemsLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, StencilIt>::type StencilLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type   BlockLoadItems;
+      typedef typename core::BlockLoad<PtxPlan, StencilLoadIt>::type BlockLoadStencil;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        } scan_storage;
+
+        typename BlockLoadItems::TempStorage   load_items;
+        typename BlockLoadStencil::TempStorage load_stencil;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE> raw_exchange;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
+    typedef typename ptx_plan::StencilLoadIt      StencilLoadIt;
+    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
+    typedef typename ptx_plan::BlockLoadStencil   BlockLoadStencil;
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan          BlockScan;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      USE_STENCIL      = !thrust::detail::is_same<StencilIt, no_stencil_tag>::value,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  storage;
+      ScanTileState &tile_state;
+      ItemsLoadIt    items_load_it;
+      StencilLoadIt  stencil_load_it;
+      OutputIt       output_it;
+      Predicate      predicate;
+      Size           num_items;
+
+      //------------------------------------------
+      // scatter results to memory
+      //------------------------------------------
+
+      THRUST_DEVICE_FUNCTION void
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  num_tile_selections,
+              Size num_selections_prefix)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            new (&storage.raw_exchange[local_scatter_offset]) item_type(items[ITEM]);
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          output_it[num_selections_prefix + item] = storage.raw_exchange[item];
+        }
+      }    // func scatter
+
+      //------------------------------------------
+      // specialize predicate on different types
+      //------------------------------------------
+
+      template <int T>
+      struct __tag {};
+
+      enum ItemStencil
+      {
+        ITEM,
+        STENCIL
+      };
+
+      template <bool TAG, class T>
+      struct wrap_value
+      {
+        T const &              x;
+        THRUST_DEVICE_FUNCTION wrap_value(T const &x) : x(x) {}
+
+        THRUST_DEVICE_FUNCTION T const &operator()() const { return x; };
+      };    // struct wrap_type
+
+      //------- item
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &x,
+                        __tag<false /* USE_STENCIL */>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+      //-------- stencil
+
+      template <class T>
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, T> const &x,
+                        __tag<true>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, no_stencil_tag_> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, stencil_type> const &,
+                        __tag<false>)
+      {
+        return false;
+      }
+
+      template <bool IS_LAST_TILE, ItemStencil TYPE, class T>
+      THRUST_DEVICE_FUNCTION void
+      compute_selection_flags(int num_tile_items,
+                              T (&values)[ITEMS_PER_THREAD],
+                              Size (&selection_flags)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Out-of-bounds items are selection_flags
+          selection_flags[ITEM] = 1;
+
+          if (!IS_LAST_TILE ||
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+          {
+            selection_flags[ITEM] =
+                predicate_wrapper(wrap_value<TYPE, T>(values[ITEM]),
+                                  __tag<USE_STENCIL>());
+          }
+        }
+      }
+
+      //------------------------------------------
+      // consume tiles
+      //------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE) {
+          BlockLoadItems(storage.load_items)
+              .Load(items_load_it + tile_base,
+                    items_loc,
+                    num_tile_items);
+        }
+        else
+        {
+          BlockLoadItems(storage.load_items)
+              .Load(items_load_it + tile_base,
+                    items_loc);
+        }
+
+        core::sync_threadblock();
 
+        if (USE_STENCIL)
+        {
+          stencil_type stencil_loc[ITEMS_PER_THREAD];
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+          if (IS_LAST_TILE)
+          {
+            BlockLoadStencil(storage.load_stencil)
+                .Load(stencil_load_it + tile_base,
+                      stencil_loc,
+                      num_tile_items);
+          }
+          else
+          {
+            BlockLoadStencil(storage.load_stencil)
+                .Load(stencil_load_it + tile_base,
+                      stencil_loc);
+          }
+
+          compute_selection_flags<IS_LAST_TILE, STENCIL>(num_tile_items,
+                                                         stencil_loc,
+                                                         selection_flags);
+        }
+        else /* Use predicate on items rather then stencil */
+        {
+          compute_selection_flags<IS_LAST_TILE, ITEM>(num_tile_items,
+                                                      items_loc,
+                                                      selection_flags);
+        }
+
+        core::sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       storage.scan_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        core::sync_threadblock();
+
+        scatter(items_loc,
+                selection_flags,
+                selection_idx,
+                num_tile_selections,
+                num_selections_prefix);
+
+
+        return num_selections;
+      }    // func consume_tile_impl
+
+      template <bool         IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION Size
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }    // func consume_tile
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &       storage_,
+                                  ScanTileState &     tile_state_,
+                                  ItemsIt             items_it,
+                                  StencilIt           stencil_it,
+                                  OutputIt            output_it_,
+                                  Predicate           predicate_,
+                                  Size                num_items_,
+                                  int                 num_tiles,
+                                  NumSelectedOutputIt num_selected_out)
+          : storage(storage_),
+            tile_state(tile_state_),
+            items_load_it(core::make_load_iterator(ptx_plan(), items_it)),
+            stencil_load_it(core::make_load_iterator(ptx_plan(), stencil_it)),
+            output_it(output_it_),
+            predicate(predicate_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }    // ctor impl
+    };
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt             items_it,
+                       StencilIt           stencil_it,
+                       OutputIt            output_it,
+                       Predicate           predicate,
+                       Size                num_items,
+                       NumSelectedOutputIt num_selected_out,
+                       ScanTileState       tile_state,
+                       int                 num_tiles,
+                       char *              shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           items_it,
+           stencil_it,
+           output_it,
+           predicate,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };    // struct CopyIfAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+  };    // struct InitAgent
+
+  template <class ItemsIt,
+            class StencilIt,
+            class OutputIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  THRUST_RUNTIME_FUNCTION
+  static cudaError_t doit_step(void *           d_temp_storage,
+                               size_t &         temp_storage_bytes,
+                               ItemsIt          items,
+                               StencilIt        stencil,
+                               OutputIt         output_it,
+                               Predicate        predicate,
+                               NumSelectedOutIt num_selected_out,
+                               Size             num_items,
+                               cudaStream_t     stream)
+  {
+    if (num_items == 0)
+      return cudaSuccess;
+
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        CopyIfAgent<ItemsIt,
+                    StencilIt,
+                    OutputIt,
+                    Predicate,
+                    Size,
+                    NumSelectedOutIt> >
+        copy_if_agent;
+
+    typedef typename copy_if_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type    init_plan    = init_agent::get_plan();
+    typename get_plan<copy_if_agent>::type copy_if_plan = copy_if_agent::get_plan(stream);
+
+    int tile_size = copy_if_plan.items_per_tile;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+    size_t vshmem_size = core::vshmem_size(copy_if_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return status;
+
+    size_t allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    void* allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "copy_if::init_agent");
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char*)allocations[1] : NULL;
+
+    copy_if_agent pa(copy_if_plan, num_items, stream, vshmem_ptr, "copy_if::partition_agent");
+
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    pa.launch(items,
+              stencil,
+              output_it,
+              predicate,
+              num_items,
+              num_selected_out,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename InputIt,
+            typename StencilIt,
+            typename OutputIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  OutputIt copy_if(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   StencilIt                  stencil,
+                   OutputIt                   output,
+                   Predicate                  predicate)
+  {
+    typedef int size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    if (num_items == 0)
+      return output;
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       output,
+                       predicate,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "copy_if failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "copy_if failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "copy_if failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       output,
+                       predicate,
+                       d_num_selected_out,
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "copy_if failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "copy_if failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return output + num_selected;
+  }
+
+}    // namespace __copy_if
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIterator,
+          class OutputIterator,
+          class Predicate>
+OutputIterator __host__ __device__
+copy_if(execution_policy<Derived> &policy,
+        InputIterator              first,
+        InputIterator              last,
+        OutputIterator             result,
+        Predicate                  pred)
+{
+  THRUST_CDP_DISPATCH((return __copy_if::copy_if(policy,
+                                                   first,
+                                                   last,
+                                                   __copy_if::no_stencil_tag(),
+                                                   result,
+                                                   pred);),
+                      (return
+                         thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         result,
+                                         pred);));
+} // func copy_if
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIterator,
+          class StencilIterator,
+          class OutputIterator,
+          class Predicate>
+OutputIterator __host__ __device__
+copy_if(execution_policy<Derived> &policy,
+        InputIterator              first,
+        InputIterator              last,
+        StencilIterator            stencil,
+        OutputIterator             result,
+        Predicate                  pred)
+{
+  THRUST_CDP_DISPATCH(
+    (return __copy_if::copy_if(policy, first, last, stencil, result, pred);),
+    (return thrust::copy_if(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              stencil,
+                              result,
+                              pred);));
+}    // func copy_if
 
-#include <thrust/system/cuda/detail/copy_if.inl>
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
 
+#include <thrust/copy.h>
+#endif
diff --git a/thrust/system/cuda/detail/copy_if.inl b/thrust/system/cuda/detail/copy_if.inl
deleted file mode 100644
index 9a95f72f6..000000000
--- a/thrust/system/cuda/detail/copy_if.inl
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/copy_if.h>
-#include <thrust/detail/seq.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/detail/internal/decompose.h>
-#include <thrust/scan.h>
-#include <thrust/system/cuda/detail/default_decomposition.h>
-#include <thrust/system/cuda/detail/reduce_intervals.h>
-#include <thrust/system/cuda/detail/block/inclusive_scan.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/functional.h>
-
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace copy_if_detail
-{
-
-
-template<typename InputIterator1,
-         typename InputIterator2,
-         typename InputIterator3,
-         typename Decomposition,
-         typename OutputIterator,
-         typename Context>
-struct copy_if_intervals_closure
-{
-  InputIterator1 input;
-  InputIterator2 stencil;
-  InputIterator3 offsets;
-  Decomposition decomp;
-  OutputIterator output;
-
-  typedef Context context_type;
-  context_type context;
-  
-  __host__ __device__
-  copy_if_intervals_closure(InputIterator1 input,
-                            InputIterator2 stencil,
-                            InputIterator3 offsets,
-                            Decomposition decomp,
-                            OutputIterator output,
-                            Context context = Context())
-    : input(input), stencil(stencil), offsets(offsets), decomp(decomp), output(output), context(context) {}
-
-  __device__ __thrust_forceinline__
-  void operator()(void)
-  {
-    typedef unsigned int PredicateType;
-    
-    const unsigned int CTA_SIZE = context_type::ThreadsPerBlock::value;
-
-    thrust::plus<PredicateType> binary_op;
-
-    __shared__ PredicateType sdata[CTA_SIZE];  context.barrier();
-    
-    typedef typename Decomposition::index_type IndexType;
-
-    // this block processes results in [range.begin(), range.end())
-    thrust::system::detail::internal::index_range<IndexType> range = decomp[context.block_index()];
-
-    IndexType base = range.begin();
-
-    PredicateType predicate = 0;
-    
-    // advance input iterators to this thread's starting position
-    input   += base + context.thread_index();
-    stencil += base + context.thread_index();
-
-    // advance output to this interval's starting position
-    if(context.block_index() != 0)
-    {
-      InputIterator3 temp = offsets + (context.block_index() - 1);
-      output += *temp;
-    }
-
-    // process full blocks
-    while(base + CTA_SIZE <= range.end())
-    {
-      // read data
-      sdata[context.thread_index()] = predicate = *stencil;
-      
-      context.barrier();
-
-      // scan block
-      block::inclusive_scan(context, sdata, binary_op);
-      
-      // write data
-      if(predicate)
-      {
-        OutputIterator temp2 = output + (sdata[context.thread_index()] - 1);
-        *temp2 = *input;
-      }
-
-      // advance inputs by CTA_SIZE
-      base    += CTA_SIZE;
-      input   += CTA_SIZE;
-      stencil += CTA_SIZE;
-
-      // advance output by number of true predicates
-      output += sdata[CTA_SIZE - 1];
-
-      context.barrier();
-    }
-
-    // process partially full block at end of input (if necessary)
-    if(base < range.end())
-    {
-      // read data
-      if(base + context.thread_index() < range.end())
-      {
-        sdata[context.thread_index()] = predicate = *stencil;
-      }
-      else
-      {
-        sdata[context.thread_index()] = predicate = 0;
-      }
-      
-      context.barrier();
-
-      // scan block
-      block::inclusive_scan(context, sdata, binary_op);
-      
-      // write data
-      if(predicate) // expects predicate=false for >= interval_end
-      {
-        OutputIterator temp2 = output + (sdata[context.thread_index()] - 1);
-        *temp2 = *input;
-      }
-    }
-  }
-}; // copy_if_intervals_closure
-
-
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator,
-         typename Predicate>
-__host__ __device__
-OutputIterator copy_if(execution_policy<DerivedPolicy> &exec,
-                       InputIterator1 first,
-                       InputIterator1 last,
-                       InputIterator2 stencil,
-                       OutputIterator output,
-                       Predicate pred)
-{
-  typedef typename thrust::iterator_difference<InputIterator1>::type IndexType;
-
-  if(first == last)
-  {
-    return output;
-  }
-
-  typedef thrust::system::detail::internal::uniform_decomposition<IndexType> Decomposition;
-  typedef thrust::detail::temporary_array<IndexType, DerivedPolicy>          IndexArray;
-
-  Decomposition decomp = default_decomposition(last - first);
-
-  // storage for per-block predicate counts
-  IndexArray block_results(exec, decomp.size());
-
-  // convert stencil into an iterator that produces integral values in {0,1}
-  typedef typename thrust::detail::predicate_to_integral<Predicate,IndexType>              PredicateToIndexTransform;
-  typedef thrust::transform_iterator<PredicateToIndexTransform, InputIterator2, IndexType> PredicateToIndexIterator;
-
-  PredicateToIndexIterator predicate_stencil(stencil, PredicateToIndexTransform(pred));
-
-  // compute number of true values in each interval
-  thrust::system::cuda::detail::reduce_intervals(exec, predicate_stencil, block_results.begin(), thrust::plus<IndexType>(), decomp);
-
-  // scan the partial sums
-  thrust::inclusive_scan(exec, block_results.begin(), block_results.end(), block_results.begin(), thrust::plus<IndexType>());
-
-  // copy values to output
-  const unsigned int ThreadsPerBlock = 256;
-  typedef typename IndexArray::iterator InputIterator3;
-  typedef detail::statically_blocked_thread_array<ThreadsPerBlock> Context;
-  typedef copy_if_intervals_closure<InputIterator1,PredicateToIndexIterator,InputIterator3,Decomposition,OutputIterator,Context> Closure;
-  Closure closure(first, predicate_stencil, block_results.begin(), decomp, output);
-  detail::launch_closure(exec, closure, decomp.size(), ThreadsPerBlock);
-
-  return output + block_results[decomp.size() - 1];
-} // end copy_if()
-
-
-} // end copy_if_detail
-
-
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator,
-         typename Predicate>
-__host__ __device__
-OutputIterator copy_if(execution_policy<DerivedPolicy> &exec,
-                       InputIterator1 first,
-                       InputIterator1 last,
-                       InputIterator2 stencil,
-                       OutputIterator output,
-                       Predicate pred)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static OutputIterator parallel_path(execution_policy<DerivedPolicy> &exec,
-                                        InputIterator1 first,
-                                        InputIterator1 last,
-                                        InputIterator2 stencil,
-                                        OutputIterator output,
-                                        Predicate pred)
-    {
-      return thrust::system::cuda::detail::copy_if_detail::copy_if(exec, first, last, stencil, output, pred);
-    } // end parallel_path()
-
-    __host__ __device__
-    static OutputIterator sequential_path(execution_policy<DerivedPolicy> &,
-                                          InputIterator1 first,
-                                          InputIterator1 last,
-                                          InputIterator2 stencil,
-                                          OutputIterator output,
-                                          Predicate pred)
-    {
-      return thrust::copy_if(thrust::seq, first, last, stencil, output, pred);
-    } // end parallel_path()
-  }; // end workaround
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, last, stencil, output, pred);
-#else
-  return workaround::sequential_path(exec, first, last, stencil, output, pred);
-#endif
-} // end copy_if()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
-
diff --git a/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/system/cuda/detail/core/agent_launcher.h
new file mode 100644
index 000000000..dbb26f33f
--- /dev/null
+++ b/thrust/system/cuda/detail/core/agent_launcher.h
@@ -0,0 +1,1145 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <cub/detail/device_synchronize.cuh>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
+#include <thrust/system/cuda/detail/core/util.h>
+#include <cassert>
+
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+namespace core {
+
+
+#if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
+#if 0
+  template <class Agent, class... Args>
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+      _kernel_agent(Args... args)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(args..., shmem);
+  }
+#else
+  template <class Agent, class _0>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, shmem);
+  }
+  template <class Agent, class _0, class _1>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, shmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
+  {
+    extern __shared__ char shmem[];
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, shmem);
+  }
+#endif
+
+  ////////////////////////////////////////////////////////////
+
+
+#if 0
+  template <class Agent, class... Args>
+  void __global__
+  __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+      _kernel_agent_vshmem(char* vshmem, Args... args)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(args..., vshmem);
+  }
+#else
+  template <class Agent, class _0>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, vshmem);
+  }
+  template <class Agent, class _0, class _1>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, vshmem);
+  }
+  template <class Agent, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS)
+  _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE)
+  {
+    extern __shared__ char shmem[];
+    vshmem = vshmem == NULL ? shmem : vshmem + blockIdx.x * temp_storage_size<typename Agent::ptx_plan>::value;
+    Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE, vshmem);
+  }
+#endif
+#else
+#if 0
+  template <class , class... Args >
+  void __global__  _kernel_agent(Args... args) {}
+  template <class , class... Args >
+  void __global__  _kernel_agent_vshmem(char*, Args... args) {}
+#else
+  template <class, class _0>
+  void __global__ _kernel_agent(_0) {}
+  template <class, class _0, class _1>
+  void __global__ _kernel_agent(_0,_1) {}
+  template <class, class _0, class _1, class _2>
+  void __global__ _kernel_agent(_0,_1,_2) {}
+  template <class, class _0, class _1, class _2, class _3>
+  void __global__ _kernel_agent(_0,_1,_2,_3) {}
+  template <class, class _0, class _1, class _2, class _3, class _4>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {}
+  ////////////////////////////////////////////////////////////
+  template <class, class _0>
+  void __global__ _kernel_agent_vshmem(char*,_0) {}
+  template <class, class _0, class _1>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1) {}
+  template <class, class _0, class _1, class _2>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2) {}
+  template <class, class _0, class _1, class _2, class _3>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3) {}
+  template <class, class _0, class _1, class _2, class _3, class _4>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+  void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {}
+  template <class, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+  void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {}
+#endif
+#endif
+
+
+  template<class Agent>
+  struct AgentLauncher : Agent
+  {
+    core::AgentPlan plan;
+    size_t          count;
+    cudaStream_t    stream;
+    char const*     name;
+    unsigned int    grid;
+    char*           vshmem;
+    bool            has_shmem;
+    size_t          shmem_size;
+
+    enum
+    {
+      MAX_SHMEM_PER_BLOCK = 48 * 1024,
+    };
+    typedef
+        typename has_enough_shmem<Agent,
+                                  MAX_SHMEM_PER_BLOCK>::type has_enough_shmem_t;
+    typedef
+        has_enough_shmem<Agent,
+                                  MAX_SHMEM_PER_BLOCK> shm1;
+
+    template <class Size>
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  Size         count_,
+                  cudaStream_t stream_,
+                  char const*  name_)
+        : plan(plan_),
+          count((size_t)count_),
+          stream(stream_),
+          name(name_),
+          grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
+          vshmem(NULL),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(count > 0);
+    }
+
+    template <class Size>
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  Size         count_,
+                  cudaStream_t stream_,
+                  char*        vshmem,
+                  char const*  name_)
+        : plan(plan_),
+          count((size_t)count_),
+          stream(stream_),
+          name(name_),
+          grid(static_cast<unsigned int>((count + plan.items_per_tile - 1) / plan.items_per_tile)),
+          vshmem(vshmem),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(count > 0);
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  cudaStream_t stream_,
+                  char const*  name_)
+        : plan(plan_),
+          count(0),
+          stream(stream_),
+          name(name_),
+          grid(plan.grid_size),
+          vshmem(NULL),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(plan.grid_size > 0);
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    AgentLauncher(AgentPlan    plan_,
+                  cudaStream_t stream_,
+                  char*        vshmem,
+                  char const*  name_)
+        : plan(plan_),
+          count(0),
+          stream(stream_),
+          name(name_),
+          grid(plan.grid_size),
+          vshmem(vshmem),
+          has_shmem((size_t)core::get_max_shared_memory_per_block() >= (size_t)plan.shared_memory_size),
+          shmem_size(has_shmem ? plan.shared_memory_size : 0)
+    {
+      assert(plan.grid_size > 0);
+    }
+
+#if 0
+    THRUST_RUNTIME_FUNCTION
+    AgentPlan static get_plan(cudaStream_t s, void* d_ptr = 0)
+    {
+      // in separable compilation mode, we have no choice
+      // but to call kernel to get agent_plan
+      // otherwise the risk is something may fail
+      // if user mix & match ptx versions in a separably compiled function
+      // http://nvbugs/1772071
+      // XXX may be it is too string of a requirements, consider relaxing it in
+      // the future
+#ifdef __CUDACC_RDC__
+      return core::get_agent_plan<Agent>(s, d_ptr);
+#else
+      return get_agent_plan<Agent>(core::get_ptx_version());
+#endif
+    }
+    THRUST_RUNTIME_FUNCTION
+    AgentPlan static get_plan_default()
+    {
+      return get_agent_plan<Agent>(sm_arch<0>::type::ver);
+    }
+#endif
+
+    THRUST_RUNTIME_FUNCTION
+    typename core::get_plan<Agent>::type static get_plan(cudaStream_t , void* d_ptr = 0)
+    {
+      THRUST_UNUSED_VAR(d_ptr);
+      return get_agent_plan<Agent>(core::get_ptx_version());
+    }
+
+    THRUST_RUNTIME_FUNCTION
+    typename core::get_plan<Agent>::type static get_plan()
+    {
+      return get_agent_plan<Agent>(lowest_supported_sm_arch::ver);
+    }
+
+    THRUST_RUNTIME_FUNCTION void sync() const
+    {
+      CubDebug(cub::detail::DebugSyncStream(stream));
+    }
+
+    template<class K>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    max_blocks_per_sm_impl(K k, int block_threads)
+    {
+      int occ;
+      cudaError_t status = cub::MaxSmOccupancy(occ, k, block_threads);
+      return cuda_optional<int>(status == cudaSuccess ? occ : -1, status);
+    }
+
+    template <class K>
+    cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    max_sm_occupancy(K k) const
+    {
+      return max_blocks_per_sm_impl(k, plan.block_threads);
+    }
+
+    template<class K>
+    THRUST_RUNTIME_FUNCTION
+    void print_info(K k) const
+    {
+      #if THRUST_DEBUG_SYNC_FLAG 
+      cuda_optional<int> occ = max_sm_occupancy(k);
+      const int ptx_version = core::get_ptx_version();
+      if (count > 0)
+      {
+        _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %llu items total, %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version \n",
+                name,
+                grid,
+                plan.block_threads,
+                (has_shmem ? (int)plan.shared_memory_size : 0),
+                (long long)stream,
+                (long long)count,
+                plan.items_per_thread,
+                (int)occ,
+                (!has_shmem ? (int)plan.shared_memory_size : 0),
+                (int)ptx_version);
+      }
+      else
+      {
+        _CubLog("Invoking %s<<<%u, %d, %d, %lld>>>(), %d items per thread, %d SM occupancy, %d vshmem size, %d ptx_version\n",
+                name,
+                grid,
+                plan.block_threads,
+                (has_shmem ? (int)plan.shared_memory_size : 0),
+                (long long)stream,
+                plan.items_per_thread,
+                (int)occ,
+                (!has_shmem ? (int)plan.shared_memory_size : 0),
+                (int)ptx_version);
+      }
+      #else
+      (void)k;
+      #endif
+    }
+
+    ////////////////////
+    //  Variadic code
+    ////////////////////
+
+#if 0
+    template<class... Args>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      return max_blocks_per_sm_impl(_kernel_agent<Agent, Args...>, plan.block_threads);
+    }
+#else
+    template<class _0>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0) = _kernel_agent<Agent, _0>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0, _1) = _kernel_agent<Agent, _0, _1>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2) = _kernel_agent<Agent, _0, _1, _2>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3) = _kernel_agent<Agent, _0, _1, _2,_3>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4) = _kernel_agent<Agent, _0, _1, _2,_3,_4>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+    template<class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    static cuda_optional<int> THRUST_RUNTIME_FUNCTION
+    get_max_blocks_per_sm(AgentPlan plan)
+    {
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE>;
+      return max_blocks_per_sm_impl(ptr, plan.block_threads);
+    }
+#endif
+
+
+
+#if 0
+
+    // If we are guaranteed to have enough shared memory
+    // don't compile other kernel which accepts pointer
+    // and save on compilations
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, Args... args) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      print_info(_kernel_agent<Agent, Args...>);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(_kernel_agent<Agent, Args...>, args...);
+    }
+
+    // If there is a risk of not having enough shared memory
+    // we compile generic kernel instead.
+    // This kernel is likely to be somewhat slower, but it can accomodate
+    // both shared and virtualized shared memories.
+    // Alternative option is to compile two kernels, one using shared and one
+    // using virtualized shared memory. While this can be slightly faster if we
+    // do actually have enough shared memory, the compilation time will double.
+    //
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, Args... args) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      print_info(_kernel_agent_vshmem<Agent, Args...>);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(_kernel_agent_vshmem<Agent, Args...>, vshmem, args...);
+    }
+
+    template <class... Args>
+    void THRUST_RUNTIME_FUNCTION
+    launch(Args... args) const
+    {
+      launch_impl(has_enough_shmem_t(),args...);
+      sync();
+    }
+#else
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0) = _kernel_agent_vshmem<Agent, _0>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0);
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1) = _kernel_agent_vshmem<Agent, _0, _1>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1);
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2) = _kernel_agent_vshmem<Agent, _0, _1, _2>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2);
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3);
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8>;
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::false_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9,_xA xA,_xB xB,_xC xC,_xD xD,_xE xE) const
+    {
+      assert((has_shmem && vshmem == NULL) || (!has_shmem && vshmem != NULL && shmem_size == 0));
+      void (*ptr)(char*, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) = _kernel_agent_vshmem<Agent, _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, shmem_size, stream)
+          .doit(ptr, vshmem, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0) = _kernel_agent<Agent, _0>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0);
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0, _1) = _kernel_agent<Agent, _0, _1>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1);
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2) = _kernel_agent<Agent, _0, _1, _2>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2);
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3) = _kernel_agent<Agent, _0, _1, _2,_3>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3);
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4) = _kernel_agent<Agent, _0, _1, _2,_3,_4>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch_impl(thrust::detail::true_type, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      assert(has_shmem && vshmem == NULL);
+      void (*ptr)(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE) = _kernel_agent<Agent, _0, _1, _2,_3,_4,_5,_6,_7,_8,_9,_xA,_xB,_xC,_xD,_xE>;
+      print_info(ptr);
+      launcher::triple_chevron(grid, plan.block_threads, plan.shared_memory_size, stream)
+          .doit(ptr,x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+    }
+
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+    ////////////////////////////////////////////////////////
+
+    template <class _0>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0) const
+    {
+      launch_impl(has_enough_shmem_t(), x0);
+      sync();
+    }
+    template <class _0, class _1>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1);
+      sync();
+    }
+    template <class _0, class _1, class _2>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD);
+      sync();
+    }
+    template <class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9, class _xA, class _xB, class _xC, class _xD, class _xE>
+    void THRUST_RUNTIME_FUNCTION
+    launch(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) const
+    {
+      launch_impl(has_enough_shmem_t(), x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, xE);
+      sync();
+    }
+#endif
+
+
+  };
+
+}    // namespace core
+}
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/core/alignment.h b/thrust/system/cuda/detail/core/alignment.h
new file mode 100644
index 000000000..4b807ebc1
--- /dev/null
+++ b/thrust/system/cuda/detail/core/alignment.h
@@ -0,0 +1,250 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// TODO: This can probably be removed.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+namespace alignment_of_detail {
+
+
+  template <typename T>
+  class alignment_of_impl;
+
+  template <typename T, std::size_t size_diff>
+  struct helper
+  {
+    static const std::size_t value = size_diff;
+  };
+
+  template <typename T>
+  class helper<T, 0>
+  {
+  public:
+    static const std::size_t value = alignment_of_impl<T>::value;
+  };
+
+  template <typename T>
+  class alignment_of_impl
+  {
+  private:
+    struct big
+    {
+      T    x;
+      char c;
+    };
+
+  public:
+    static const std::size_t value = helper<big, sizeof(big) - sizeof(T)>::value;
+  };
+
+
+}    // end alignment_of_detail
+
+
+template <typename T>
+struct alignment_of
+    : alignment_of_detail::alignment_of_impl<T>
+{
+};
+
+
+template <std::size_t Align>
+struct aligned_type;
+
+// __align__ is CUDA-specific, so guard it
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+// implementing aligned_type portably is tricky:
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// implement aligned_type with specialization because MSVC
+// requires literals as arguments to declspec(align(n))
+template <>
+struct aligned_type<1>
+{
+  struct __align__(1) type{};
+};
+
+template <>
+struct aligned_type<2>
+{
+  struct __align__(2) type{};
+};
+
+template <>
+struct aligned_type<4>
+{
+  struct __align__(4) type{};
+};
+
+template <>
+struct aligned_type<8>
+{
+  struct __align__(8) type{};
+};
+
+template <>
+struct aligned_type<16>
+{
+  struct __align__(16) type{};
+};
+
+template <>
+struct aligned_type<32>
+{
+  struct __align__(32) type{};
+};
+
+template <>
+struct aligned_type<64>
+{
+  struct __align__(64) type{};
+};
+
+template <>
+struct aligned_type<128>
+{
+  struct __align__(128) type{};
+};
+
+template <>
+struct aligned_type<256>
+{
+  struct __align__(256) type{};
+};
+
+template <>
+struct aligned_type<512>
+{
+  struct __align__(512) type{};
+};
+
+template <>
+struct aligned_type<1024>
+{
+  struct __align__(1024) type{};
+};
+
+template <>
+struct aligned_type<2048>
+{
+  struct __align__(2048) type{};
+};
+
+template <>
+struct aligned_type<4096>
+{
+  struct __align__(4096) type{};
+};
+
+template <>
+struct aligned_type<8192>
+{
+  struct __align__(8192) type{};
+};
+#elif (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300)
+// implement aligned_type with specialization because gcc 4.2
+// requires literals as arguments to __attribute__(aligned(n))
+template <>
+struct aligned_type<1>
+{
+  struct __align__(1) type{};
+};
+
+template <>
+struct aligned_type<2>
+{
+  struct __align__(2) type{};
+};
+
+template <>
+struct aligned_type<4>
+{
+  struct __align__(4) type{};
+};
+
+template <>
+struct aligned_type<8>
+{
+  struct __align__(8) type{};
+};
+
+template <>
+struct aligned_type<16>
+{
+  struct __align__(16) type{};
+};
+
+template <>
+struct aligned_type<32>
+{
+  struct __align__(32) type{};
+};
+
+template <>
+struct aligned_type<64>
+{
+  struct __align__(64) type{};
+};
+
+template <>
+struct aligned_type<128>
+{
+  struct __align__(128) type{};
+};
+
+#else
+// assume the compiler allows template parameters as
+// arguments to __align__
+template <std::size_t Align>
+struct aligned_type
+{
+  struct __align__(Align) type{};
+};
+#endif    // THRUST_HOST_COMPILER
+#else
+template <std::size_t Align>
+struct aligned_type
+{
+  struct type
+  {
+  };
+};
+#endif    // THRUST_DEVICE_COMPILER
+
+
+template <std::size_t Len, std::size_t Align>
+struct aligned_storage
+{
+  union type
+  {
+    unsigned char data[Len];
+
+    typename aligned_type<Align>::type align;
+  };
+};
+
+
+}    // end cuda_
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/system/cuda/detail/core/triple_chevron_launch.h
new file mode 100644
index 000000000..65a7283b7
--- /dev/null
+++ b/thrust/system/cuda/detail/core/triple_chevron_launch.h
@@ -0,0 +1,152 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/core/alignment.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <cassert>
+
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+namespace launcher {
+
+  struct triple_chevron
+  {
+    typedef size_t Size;
+    dim3 const grid;
+    dim3 const block;
+    Size const shared_mem;
+    cudaStream_t const stream;
+
+    THRUST_RUNTIME_FUNCTION
+    triple_chevron(dim3         grid_,
+                   dim3         block_,
+                   Size         shared_mem_ = 0,
+                   cudaStream_t stream_     = 0)
+        : grid(grid_),
+          block(block_),
+          shared_mem(shared_mem_),
+          stream(stream_) {}
+
+    template<class K, class... Args>
+    cudaError_t __host__
+    doit_host(K k, Args const&... args) const
+    {
+      k<<<grid, block, shared_mem, stream>>>(args...);
+      return cudaPeekAtLastError();
+    }
+
+    template<class T>
+    size_t __device__
+    align_up(size_t offset) const
+    {
+      size_t alignment = alignment_of<T>::value;
+      return alignment * ((offset + (alignment - 1))/ alignment);
+    }
+
+    size_t __device__ argument_pack_size(size_t size) const { return size; }
+    template <class Arg, class... Args>
+    size_t __device__
+    argument_pack_size(size_t size, Arg const& arg, Args const&... args) const
+    {
+      size = align_up<Arg>(size);
+      return argument_pack_size(size + sizeof(Arg), args...);
+    }
+
+    template <class Arg>
+    size_t __device__ copy_arg(char* buffer, size_t offset, Arg arg) const
+    {
+      offset = align_up<Arg>(offset);
+      for (int i = 0; i != sizeof(Arg); ++i)
+        buffer[offset+i] = *((char*)&arg + i);
+      return offset + sizeof(Arg);
+    }
+
+    __device__
+    void fill_arguments(char*, size_t) const
+    {}
+
+    template<class Arg, class... Args>
+    __device__
+    void fill_arguments(char* buffer,
+                     size_t offset,
+                     Arg const& arg,
+                     Args const& ... args) const
+    {
+      fill_arguments(buffer, copy_arg(buffer, offset, arg), args...);
+    }
+
+    #ifdef THRUST_RDC_ENABLED
+    template<class K, class... Args>
+    cudaError_t __device__
+    doit_device(K k, Args const&... args) const
+    {
+      const size_t size = argument_pack_size(0,args...);
+      void *param_buffer = cudaGetParameterBuffer(64,size);
+      fill_arguments((char*)param_buffer, 0, args...);
+      return launch_device(k, param_buffer);
+    }
+
+    template <class K>
+    cudaError_t __device__
+    launch_device(K k, void* buffer) const
+    {
+      return cudaLaunchDevice((void*)k,
+                              buffer,
+                              dim3(grid),
+                              dim3(block),
+                              shared_mem,
+                              stream);
+    }
+    #else 
+    template<class K, class... Args>
+    cudaError_t __device__
+    doit_device(K, Args const&... ) const
+    {
+      return cudaErrorNotSupported;
+    }
+    #endif
+
+    __thrust_exec_check_disable__
+    template <class K, class... Args>
+    THRUST_FUNCTION
+    cudaError_t doit(K k, Args const&... args) const
+    {
+      NV_IF_TARGET(NV_IS_HOST,
+                   (return doit_host(k, args...);),
+                   (return doit_device(k, args...);));
+    }
+
+  }; // struct triple_chevron
+
+}    // namespace launcher
+}    // namespace cuda_
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/core/util.h b/thrust/system/cuda/detail/core/util.h
new file mode 100644
index 000000000..e2f5f8299
--- /dev/null
+++ b/thrust/system/cuda/detail/core/util.h
@@ -0,0 +1,802 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/system_error.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <cub/block/block_load.cuh>
+#include <cub/block/block_scan.cuh>
+#include <cub/block/block_store.cuh>
+
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+namespace core {
+
+#ifdef _NVHPC_CUDA
+#  if (__NVCOMPILER_CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__NVCOMPILER_CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  else
+#    define THRUST_TUNING_ARCH sm30
+#  endif
+#else
+#  if (__CUDA_ARCH__ >= 600)
+#    define THRUST_TUNING_ARCH sm60
+#  elif (__CUDA_ARCH__ >= 520)
+#    define THRUST_TUNING_ARCH sm52
+#  elif (__CUDA_ARCH__ >= 350)
+#    define THRUST_TUNING_ARCH sm35
+#  elif (__CUDA_ARCH__ >= 300)
+#    define THRUST_TUNING_ARCH sm30
+#  elif !defined (__CUDA_ARCH__)
+#    define THRUST_TUNING_ARCH sm30
+#  endif
+#endif
+
+  // Typelist - a container of types, supports up to 10 types
+  // --------------------------------------------------------------------------
+
+  class _;
+  template <class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _, class = _>
+  struct typelist;
+
+  // -------------------------------------
+
+  // supported SM arch
+  // ---------------------
+  struct sm30  { enum { ver = 300, warpSize = 32 }; };
+  struct sm35  { enum { ver = 350, warpSize = 32 }; };
+  struct sm52  { enum { ver = 520, warpSize = 32 }; };
+  struct sm60  { enum { ver = 600, warpSize = 32 }; };
+
+  // list of sm, checked from left to right order
+  // the rightmost is the lowest sm arch supported
+  // --------------------------------------------
+  typedef typelist<sm60,sm52,sm35,sm30> sm_list;
+
+  // lowest supported SM arch
+  // --------------------------------------------------------------------------
+
+  template<class, class>
+  struct lowest_supported_sm_arch_impl;
+
+  template <class SM, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct lowest_supported_sm_arch_impl<SM, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : lowest_supported_sm_arch_impl<_0, typelist<    _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+  template <class SM>
+  struct lowest_supported_sm_arch_impl<SM, typelist<> >
+  {
+    typedef SM type;
+  };
+
+  typedef typename lowest_supported_sm_arch_impl<_,sm_list>::type lowest_supported_sm_arch;
+
+  // metafunction to match next viable PtxPlan specialization
+  // --------------------------------------------------------------------------
+
+  __THRUST_DEFINE_HAS_NESTED_TYPE(has_tuning_t, tuning)
+  __THRUST_DEFINE_HAS_NESTED_TYPE(has_type_t, type)
+
+  template <template <class> class, class, class>
+  struct specialize_plan_impl_loop;
+  template <template <class> class, class>
+  struct specialize_plan_impl_match;
+
+  // we loop through the sm_list
+  template <template <class> class P, class SM, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_loop<P, SM, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : specialize_plan_impl_loop<P, SM, typelist<    _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+
+  // until we find first lowest match
+  template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_loop <P, SM,  typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+       : specialize_plan_impl_match<P,      typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> > {};
+
+  template<class, class>
+  struct has_sm_tuning_impl;
+
+  // specializing for Tunig which needs 1 arg
+  template <class SM,
+            template <class, class> class Tuning,
+            class _0>
+  struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0> > : has_type_t<Tuning<SM, _0> > {};
+
+  // specializing for Tunig which needs 2 args
+  template <class SM,
+            template <class, class,class> class Tuning,
+            class _0, class _1>
+  struct has_sm_tuning_impl<SM, Tuning<lowest_supported_sm_arch, _0, _1> > : has_type_t<Tuning<SM, _0, _1> > {};
+
+  template <template <class> class P, class SM>
+  struct has_sm_tuning : has_sm_tuning_impl<SM, typename P<lowest_supported_sm_arch>::tuning > {};
+
+  // once first match is found in sm_list, all remaining sm are possible
+  // candidate for tuning, so pick the first available
+  //   if the plan P has SM-level tuning then pick it,
+  //   otherwise move on to the next sm in the sm_list
+  template <template <class> class P, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+  struct specialize_plan_impl_match<P, typelist<SM, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+      : thrust::detail::conditional<
+            has_sm_tuning<P, SM>::value,
+            P<SM>,
+            specialize_plan_impl_match<P, typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> > >::type {};
+
+    template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
+    struct specialize_plan_msvc10_war
+    {
+      // if Plan has tuning type, this means it has SM-specific tuning
+      // so loop through sm_list to find match,
+      // otherwise just specialize on provided SM
+      typedef thrust::detail::conditional<has_tuning_t<Plan<lowest_supported_sm_arch> >::value,
+                                  specialize_plan_impl_loop<Plan, SM, sm_list>,
+                                  Plan<SM> >
+          type;
+    };
+
+    template <template <class> class Plan, class SM = THRUST_TUNING_ARCH>
+    struct specialize_plan : specialize_plan_msvc10_war<Plan,SM>::type::type {};
+
+
+    /////////////////////////
+    /////////////////////////
+    /////////////////////////
+
+    // retrieve temp storage size from an Agent
+    // ---------------------------------------------------------------------------
+    // metafunction introspects Agent, and if it finds TempStorage type
+    // it will return its size
+
+    __THRUST_DEFINE_HAS_NESTED_TYPE(has_temp_storage, TempStorage)
+
+    template <class Agent, class U>
+    struct temp_storage_size_impl;
+
+    template <class Agent>
+    struct temp_storage_size_impl<Agent, thrust::detail::false_type>
+    {
+      enum
+      {
+        value = 0
+      };
+    };
+
+    template <class Agent>
+    struct temp_storage_size_impl<Agent, thrust::detail::true_type>
+    {
+      enum
+      {
+        value = sizeof(typename Agent::TempStorage)
+      };
+    };
+
+    template <class Agent>
+    struct temp_storage_size
+        : temp_storage_size_impl<Agent, typename has_temp_storage<Agent>::type>
+    {
+    };
+
+    // check whether all Agents requires < MAX_SHMEM shared memory
+    // ---------------------------------------------------------------------------
+    // if so, we can use simpler kernel for dispatch, which assumes that all
+    // shared memory is on chip.
+    // Otherwise, a kernel will be compiled which can also accept virtualized
+    // shared memory, in case there is not enough on chip. This kernel is about
+    // 10% slower
+
+    template <bool, class, size_t, class>
+    struct has_enough_shmem_impl;
+
+    template <bool V, class A, size_t S, class _0, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    struct has_enough_shmem_impl<V, A, S, typelist<_0, _1, _2, _3, _4, _5, _6, _7, _8, _9> >
+        : has_enough_shmem_impl<
+              V && (temp_storage_size<specialize_plan<A::template PtxPlan, _0> >::value <= S),
+              A,
+              S,
+              typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> >
+    {
+    };
+    template <bool V, class A, size_t S>
+    struct has_enough_shmem_impl<V, A, S, typelist<> >
+    {
+      enum
+      {
+        value = V
+      };
+      typedef typename thrust::detail::conditional<value,
+                                           thrust::detail::true_type,
+                                           thrust::detail::false_type>::type type;
+    };
+
+    template <class Agent, size_t MAX_SHMEM>
+    struct has_enough_shmem : has_enough_shmem_impl<true, Agent, MAX_SHMEM, sm_list>
+    {
+    };
+
+    /////////////////////////
+    /////////////////////////
+    /////////////////////////
+
+    // AgentPlan structure and helpers
+    // --------------------------------
+
+    struct AgentPlan
+    {
+      int block_threads;
+      int items_per_thread;
+      int items_per_tile;
+      int shared_memory_size;
+      int grid_size;
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan() {}
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(int block_threads_,
+                int items_per_thread_,
+                int shared_memory_size_,
+                int grid_size_ = 0)
+          : block_threads(block_threads_),
+            items_per_thread(items_per_thread_),
+            items_per_tile(items_per_thread * block_threads),
+            shared_memory_size(shared_memory_size_),
+            grid_size(grid_size_)
+      {
+      }
+
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(AgentPlan const& plan)
+          : block_threads(plan.block_threads),
+            items_per_thread(plan.items_per_thread),
+            items_per_tile(plan.items_per_tile),
+            shared_memory_size(plan.shared_memory_size),
+            grid_size(plan.grid_size) {}
+
+      template <class PtxPlan>
+      THRUST_RUNTIME_FUNCTION
+      AgentPlan(PtxPlan,
+                typename thrust::detail::disable_if_convertible<
+                    PtxPlan,
+                    AgentPlan>::type* = NULL)
+          : block_threads(PtxPlan::BLOCK_THREADS),
+            items_per_thread(PtxPlan::ITEMS_PER_THREAD),
+            items_per_tile(PtxPlan::ITEMS_PER_TILE),
+            shared_memory_size(temp_storage_size<PtxPlan>::value),
+            grid_size(0)
+      {
+      }
+    };    // struct AgentPlan
+
+
+    __THRUST_DEFINE_HAS_NESTED_TYPE(has_Plan, Plan)
+
+    template <class Agent>
+    struct return_Plan
+    {
+      typedef typename Agent::Plan type;
+    };
+
+    template <class Agent>
+    struct get_plan : thrust::detail::conditional<
+                          has_Plan<Agent>::value,
+                          return_Plan<Agent>,
+                          thrust::detail::identity_<AgentPlan> >::type
+    {
+    };
+
+    // returns AgentPlan corresponding to a given ptx version
+    // ------------------------------------------------------
+
+    template<class, class>
+    struct get_agent_plan_impl;
+
+    template<class Agent, class SM, class _1, class _2, class _3, class _4, class _5, class _6, class _7, class _8, class _9>
+    struct get_agent_plan_impl<Agent,typelist<SM,_1,_2,_3,_4,_5,_6,_7,_8,_9> >
+    {
+      typedef typename get_plan<Agent>::type Plan;
+      Plan THRUST_RUNTIME_FUNCTION
+      static get(int ptx_version)
+      {
+        if (ptx_version >= SM::ver)
+          return Plan(specialize_plan<Agent::template PtxPlan, SM>());
+        else
+          return get_agent_plan_impl<Agent,
+                                     typelist<_1, _2, _3, _4, _5, _6, _7, _8, _9> >::
+              get(ptx_version);
+      }
+    };
+
+    template<class Agent>
+    struct get_agent_plan_impl<Agent,typelist<lowest_supported_sm_arch> >
+    {
+      typedef typename get_plan<Agent>::type Plan;
+      Plan THRUST_RUNTIME_FUNCTION
+      static get(int /* ptx_version */)
+      {
+        typedef typename get_plan<Agent>::type Plan;
+        return Plan(specialize_plan<Agent::template PtxPlan, lowest_supported_sm_arch>());
+      }
+    };
+
+    template <class Agent>
+    THRUST_RUNTIME_FUNCTION
+    typename get_plan<Agent>::type get_agent_plan(int ptx_version)
+    {
+      NV_IF_TARGET(
+        NV_IS_DEVICE,
+        (
+          THRUST_UNUSED_VAR(ptx_version);
+          using plan_type = typename get_plan<Agent>::type;
+          using ptx_plan  = typename Agent::ptx_plan;
+          return plan_type{ptx_plan{}};
+        ), // NV_IS_HOST:
+        ( return get_agent_plan_impl<Agent, sm_list>::get(ptx_version); ));
+    }
+
+// XXX keep this dead-code for now as a gentle reminder
+//     that kernel luunch which reats plan values is the most robust
+//     mechanism to extract sm-specific tuning parameters
+// TODO: since we are unable to afford kernel launch + cudaMemcpy ON EVERY
+//       algorithm invocation, we need to design a good caching strategy
+//       such that when the algorithm is called multiple times, only the
+//       first invocation will invoke kernel launch + cudaMemcpy, but
+//       the subsequent invocations, will just read cached values from host mem
+//       If launched from device, this is just a device-function call
+//       no caching is required.
+// ----------------------------------------------------------------------------
+  // if we don't know ptx version, we can call kernel
+  // to retrieve AgentPlan from device code. Slower, but guaranteed to work
+  // -----------------------------------------------------------------------
+#if 0
+  template<class Agent>
+  void __global__ get_agent_plan_kernel(AgentPlan *plan);
+
+  static __device__ AgentPlan agent_plan_device;
+
+  template<class Agent>
+  AgentPlan __device__ get_agent_plan_dev()
+  {
+    AgentPlan plan;
+    plan.block_threads      = Agent::ptx_plan::BLOCK_THREADS;
+    plan.items_per_thread   = Agent::ptx_plan::ITEMS_PER_THREAD;
+    plan.items_per_tile     = Agent::ptx_plan::ITEMS_PER_TILE;
+    plan.shared_memory_size = temp_storage_size<typename Agent::ptx_plan>::value;
+    return plan;
+  }
+
+  template <class Agent, class F>
+  AgentPlan __host__ __device__ __forceinline__
+  xget_agent_plan_impl(F f, cudaStream_t s, void* d_ptr)
+  {
+    AgentPlan plan;
+#ifdef __CUDA_ARCH__
+    plan = get_agent_plan_dev<Agent>();
+#else
+    static cub::Mutex mutex;
+    bool lock = false;
+    if (d_ptr == 0)
+    {
+      lock = true;
+      cudaGetSymbolAddress(&d_ptr, agent_plan_device);
+    }
+    if (lock)
+      mutex.Lock();
+    f<<<1,1,0,s>>>((AgentPlan*)d_ptr);
+    cudaMemcpyAsync((void*)&plan,
+                    d_ptr,
+                    sizeof(AgentPlan),
+                    cudaMemcpyDeviceToHost,
+                    s);
+    if (lock)
+      mutex.Unlock();
+    cudaStreamSynchronize(s);
+#endif
+    return plan;
+  }
+
+  template <class Agent>
+  AgentPlan THRUST_RUNTIME_FUNCTION
+  get_agent_plan(cudaStream_t s = 0, void *ptr = 0)
+  {
+    return xget_agent_plan_impl<Agent>(get_agent_plan_kernel<Agent>,
+                                        s,
+                                        ptr);
+  }
+
+  template<class Agent>
+  void __global__ get_agent_plan_kernel(AgentPlan *plan)
+  {
+    *plan = get_agent_plan_dev<Agent>();
+  }
+#endif
+
+  /////////////////////////
+  /////////////////////////
+  /////////////////////////
+
+  THRUST_RUNTIME_FUNCTION
+  inline int get_sm_count()
+  {
+    int dev_id;
+    cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
+                             "get_sm_count :"
+                             "failed to cudaGetDevice");
+
+    cudaError_t status;
+    int         i32value;
+    status = cudaDeviceGetAttribute(&i32value,
+                                    cudaDevAttrMultiProcessorCount,
+                                    dev_id);
+    cuda_cub::throw_on_error(status,
+                             "get_sm_count:"
+                             "failed to sm_count");
+    return i32value;
+  }
+
+  THRUST_RUNTIME_FUNCTION
+  inline size_t get_max_shared_memory_per_block()
+  {
+    int dev_id;
+    cuda_cub::throw_on_error(cudaGetDevice(&dev_id),
+                             "get_max_shared_memory_per_block :"
+                             "failed to cudaGetDevice");
+
+    cudaError_t status;
+    int         i32value;
+    status = cudaDeviceGetAttribute(&i32value,
+                                    cudaDevAttrMaxSharedMemoryPerBlock,
+                                    dev_id);
+    cuda_cub::throw_on_error(status,
+                             "get_max_shared_memory_per_block :"
+                             "failed to get max shared memory per block");
+
+    return static_cast<size_t>(i32value);
+  }
+
+  THRUST_RUNTIME_FUNCTION
+  inline size_t virtual_shmem_size(size_t shmem_per_block)
+  {
+    size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+    if (shmem_per_block > max_shmem_per_block)
+      return shmem_per_block;
+    else
+      return 0;
+  }
+
+  THRUST_RUNTIME_FUNCTION
+  inline size_t vshmem_size(size_t shmem_per_block, size_t num_blocks)
+  {
+    size_t max_shmem_per_block = core::get_max_shared_memory_per_block();
+    if (shmem_per_block > max_shmem_per_block)
+      return shmem_per_block*num_blocks;
+    else
+      return 0;
+  }
+
+  // LoadIterator
+  // ------------
+  // if trivial iterator is passed, wrap loads into LDG
+  //
+  template <class PtxPlan, class It>
+  struct LoadIterator
+  {
+    typedef typename iterator_traits<It>::value_type      value_type;
+    typedef typename iterator_traits<It>::difference_type size_type;
+
+    typedef typename thrust::detail::conditional<
+        is_contiguous_iterator<It>::value,
+        cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER,
+                                        value_type,
+                                        size_type>,
+                                        It>::type type;
+  };    // struct Iterator
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator_impl(It it, thrust::detail::true_type /* is_trivial */)
+  {
+    return raw_pointer_cast(&*it);
+  }
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator_impl(It it, thrust::detail::false_type /* is_trivial */)
+  {
+    return it;
+  }
+
+  template <class PtxPlan, class It>
+  typename LoadIterator<PtxPlan, It>::type __device__ __forceinline__
+  make_load_iterator(PtxPlan const&, It it)
+  {
+    return make_load_iterator_impl<PtxPlan>(
+        it, typename is_contiguous_iterator<It>::type());
+  }
+
+  template<class>
+  struct get_arch;
+
+  template<template<class> class Plan, class Arch>
+  struct get_arch<Plan<Arch> > { typedef Arch type; };
+
+  // BlockLoad
+  // -----------
+  // a helper metaprogram that returns type of a block loader
+  template <class PtxPlan,
+            class It,
+            class T    = typename iterator_traits<It>::value_type>
+  struct BlockLoad
+  {
+    using type = cub::BlockLoad<T,
+                                PtxPlan::BLOCK_THREADS,
+                                PtxPlan::ITEMS_PER_THREAD,
+                                PtxPlan::LOAD_ALGORITHM,
+                                1,
+                                1,
+                                get_arch<PtxPlan>::type::ver>;
+  };
+
+  // BlockStore
+  // -----------
+  // a helper metaprogram that returns type of a block loader
+  template <class PtxPlan,
+            class It,
+            class T = typename iterator_traits<It>::value_type>
+  struct BlockStore
+  {
+    using type = cub::BlockStore<T,
+                                 PtxPlan::BLOCK_THREADS,
+                                 PtxPlan::ITEMS_PER_THREAD,
+                                 PtxPlan::STORE_ALGORITHM,
+                                 1,
+                                 1,
+                                 get_arch<PtxPlan>::type::ver>;
+  };
+
+  // cuda_optional
+  // --------------
+  // used for function that return cudaError_t along with the result
+  //
+  template <class T>
+  class cuda_optional
+  {
+    cudaError_t status_{cudaSuccess};
+    T           value_{};
+
+  public:
+    cuda_optional() = default;
+
+    __host__ __device__
+    cuda_optional(T v, cudaError_t status = cudaSuccess) : status_(status), value_(v) {}
+
+    bool __host__ __device__
+    isValid() const { return cudaSuccess == status_; }
+
+    cudaError_t __host__ __device__
+    status() const { return status_; }
+
+    __host__ __device__ T const &
+    value() const { return value_; }
+
+    __host__ __device__ operator T const &() const { return value_; }
+  };
+
+  THRUST_RUNTIME_FUNCTION
+  inline int get_ptx_version()
+  {
+    int ptx_version = 0;
+    if (cub::PtxVersion(ptx_version) != cudaSuccess) 
+    {
+      // Failure might mean that there's no device found
+      const int current_device = cub::CurrentDevice();
+      if (current_device < 0)
+      {
+        cuda_cub::throw_on_error(cudaErrorNoDevice, "No GPU is available\n");
+      }
+
+      // Any subsequent failure means the provided device binary does not match 
+      // the generated function code
+      int major = 0, minor = 0;
+      cudaError_t attr_status;
+
+      attr_status = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device);
+      cuda_cub::throw_on_error(attr_status,
+                              "get_ptx_version :"
+                              "failed to get major CUDA device compute capability version.");
+
+      attr_status = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device);
+      cuda_cub::throw_on_error(attr_status,
+                              "get_ptx_version :"
+                              "failed to get minor CUDA device compute capability version.");
+        
+      // Index from which SM code has to start in the message below
+      int code_offset = 37;
+      char str[] = "This program was not compiled for SM     \n";
+
+      auto print_1_helper = [&](int v) {
+        str[code_offset] = static_cast<char>(v) + '0';
+        code_offset++;
+      };
+
+      // Assume two digits will be enough
+      auto print_2_helper = [&](int v) {
+        if (v / 10 != 0) {
+          print_1_helper(v / 10);
+        }
+        print_1_helper(v % 10);
+      };
+
+      print_2_helper(major);
+      print_2_helper(minor);
+
+      cuda_cub::throw_on_error(cudaErrorInvalidDevice, str);
+    }
+
+    return ptx_version;
+  }
+
+  THRUST_RUNTIME_FUNCTION
+  inline cudaError_t sync_stream(cudaStream_t stream)
+  {
+    return cub::SyncStream(stream);
+  }
+
+  inline void __device__ sync_threadblock()
+  {
+    cub::CTA_SYNC();
+  }
+
+#define CUDA_CUB_RET_IF_FAIL(e) \
+  {                             \
+    auto const error = (e);     \
+    if (cub::Debug(error, __FILE__, __LINE__)) return error; \
+  }
+
+  // uninitialized
+  // -------
+  // stores type in uninitialized form
+  //
+  template <class T>
+  struct uninitialized
+  {
+    typedef typename cub::UnitWord<T>::DeviceWord DeviceWord;
+
+    enum
+    {
+      WORDS = sizeof(T) / sizeof(DeviceWord)
+    };
+
+    DeviceWord storage[WORDS];
+
+    __host__ __device__ __forceinline__ T& get()
+    {
+      return reinterpret_cast<T&>(*this);
+    }
+
+    __host__ __device__ __forceinline__ operator T&() { return get(); }
+  };
+
+  // uninitialized_array
+  // --------------
+  // allocates uninitialized data on stack
+  template<class T, size_t N>
+  struct array
+  {
+    typedef T value_type;
+    typedef T ref[N];
+    enum {SIZE = N};
+    private:
+      T data_[N];
+
+    public:
+      __host__ __device__ T* data() { return data_; }
+      __host__ __device__ const T* data() const { return data_; }
+      __host__ __device__ T& operator[](unsigned int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](unsigned int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ unsigned int size() const { return N; }
+      __host__ __device__ operator ref&() { return data_; }
+  };
+
+
+  // uninitialized_array
+  // --------------
+  // allocates uninitialized data on stack
+  template<class T, size_t N>
+  struct uninitialized_array
+  {
+    typedef T value_type;
+    typedef T ref[N];
+    enum {SIZE = N};
+    private:
+      char data_[N * sizeof(T)];
+
+    public:
+      __host__ __device__ T* data() { return data_; }
+      __host__ __device__ const T* data() const { return data_; }
+      __host__ __device__ T& operator[](unsigned int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](unsigned int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ T& operator[](int idx) { return ((T*)data_)[idx]; }
+      __host__ __device__ T const& operator[](int idx) const { return ((T*)data_)[idx]; }
+      __host__ __device__ unsigned int size() const { return N; }
+      __host__ __device__ operator ref&() { return *reinterpret_cast<ref*>(data_); }
+      __host__ __device__ ref& get_ref() { return (ref&)*this; }
+  };
+
+  __host__ __device__ __forceinline__ size_t align_to(size_t n, size_t align)
+  {
+    return ((n+align-1)/align) * align;
+  }
+
+  namespace host {
+    inline cuda_optional<size_t> get_max_shared_memory_per_block()
+    {
+      cudaError_t status = cudaSuccess;
+      int         dev_id = 0;
+      status             = cudaGetDevice(&dev_id);
+      if (status != cudaSuccess) return cuda_optional<size_t>(0, status);
+
+      int max_shmem = 0;
+      status        = cudaDeviceGetAttribute(&max_shmem,
+                                      cudaDevAttrMaxSharedMemoryPerBlock,
+                                      dev_id);
+      if (status != cudaSuccess) return cuda_optional<size_t>(0, status);
+      return cuda_optional<size_t>(max_shmem, status);
+    }
+  }
+
+  template <int           ALLOCATIONS>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  alias_storage(void*   storage_ptr,
+                size_t& storage_size,
+                void* (&allocations)[ALLOCATIONS],
+                size_t (&allocation_sizes)[ALLOCATIONS])
+  {
+    return cub::AliasTemporaries(storage_ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+  }
+
+
+}    // namespace core
+using core::sm60;
+using core::sm52;
+using core::sm35;
+using core::sm30;
+} // namespace cuda_
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/count.h b/thrust/system/cuda/detail/count.h
index c6ae90664..b624f39dc 100644
--- a/thrust/system/cuda/detail/count.h
+++ b/thrust/system/cuda/detail/count.h
@@ -1,22 +1,80 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class UnaryPred>
+typename iterator_traits<InputIt>::difference_type __host__ __device__
+count_if(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         UnaryPred                  unary_pred)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  typedef transform_input_iterator_t<size_type,
+                                     InputIt,
+                                     UnaryPred>
+      flag_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            flag_iterator_t(first, unary_pred),
+                            thrust::distance(first, last),
+                            size_type(0),
+                            plus<size_type>());
+}
+
+template <class Derived,
+          class InputIt,
+          class Value>
+typename iterator_traits<InputIt>::difference_type __host__ __device__
+count(execution_policy<Derived> &policy,
+      InputIt                    first,
+      InputIt                    last,
+      Value const &              value)
+{
+  return cuda_cub::count_if(policy,
+                            first,
+                            last,
+                            thrust::detail::equal_to_value<Value>(value));
+}
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/cross_system.h b/thrust/system/cuda/detail/cross_system.h
new file mode 100644
index 000000000..039531d28
--- /dev/null
+++ b/thrust/system/cuda/detail/cross_system.h
@@ -0,0 +1,339 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cpp/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+  template <class Sys1, class Sys2>
+  struct cross_system : execution_policy<cross_system<Sys1, Sys2> >
+  {
+    typedef thrust::execution_policy<Sys1> policy1;
+    typedef thrust::execution_policy<Sys2> policy2;
+
+    policy1 &sys1;
+    policy2 &sys2;
+
+    inline __host__ __device__
+    cross_system(policy1 &sys1, policy2 &sys2) : sys1(sys1), sys2(sys2) {}
+
+    inline __host__ __device__
+    cross_system<Sys2, Sys1> rotate() const
+    {
+      return cross_system<Sys2, Sys1>(sys2, sys1);
+    }
+  };
+
+#if THRUST_CPP_DIALECT >= 2011
+  // Device to host.
+  template <class Sys1, class Sys2>
+  constexpr __host__ __device__
+  auto direction_of_copy(
+    thrust::system::cuda::execution_policy<Sys1> const&
+  , thrust::cpp::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToHost
+    >{}
+  )
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  constexpr __host__ __device__
+  auto direction_of_copy(
+    thrust::cpp::execution_policy<Sys1> const&
+  , thrust::system::cuda::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyHostToDevice
+    >{}
+  )
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  constexpr __host__ __device__
+  auto direction_of_copy(
+    thrust::system::cuda::execution_policy<Sys1> const&
+  , thrust::system::cuda::execution_policy<Sys2> const&
+  )
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToDevice
+    >{}
+  )
+
+  // Device to device.
+  template <class DerivedPolicy>
+  constexpr __host__ __device__
+  auto direction_of_copy(execution_policy<DerivedPolicy> const &)
+  THRUST_DECLTYPE_RETURNS(
+    thrust::detail::integral_constant<
+      cudaMemcpyKind, cudaMemcpyDeviceToDevice
+    >{}
+  )
+
+  template <class Sys1, class Sys2>
+  constexpr __host__ __device__
+  auto direction_of_copy(
+    execution_policy<cross_system<Sys1, Sys2>> const &systems
+  )
+  THRUST_DECLTYPE_RETURNS(
+    direction_of_copy(
+      derived_cast(derived_cast(systems).sys1)
+    , derived_cast(derived_cast(systems).sys2)
+    )
+  )
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToHost == Direction::value
+  >
+  is_device_to_host_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  ) noexcept
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToHost == Direction::value
+  >
+  is_device_to_host_copy(ExecutionPolicy const& exec) noexcept
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyHostToDevice == Direction::value
+  >
+  is_host_to_device_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  ) noexcept
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyHostToDevice == Direction::value
+  >
+  is_host_to_device_copy(ExecutionPolicy const& exec) noexcept
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy0,
+            typename ExecutionPolicy1,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy0>(),
+                                         std::declval<ExecutionPolicy1>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToDevice == Direction::value
+  >
+  is_device_to_device_copy(
+    ExecutionPolicy0 const& exec0
+  , ExecutionPolicy1 const& exec1
+  ) noexcept
+  {
+    return {};
+  }
+
+  template <typename ExecutionPolicy,
+            // MSVC2015 WAR: put decltype here instead of in trailing return type
+            typename Direction =
+              decltype(direction_of_copy(std::declval<ExecutionPolicy>()))>
+  constexpr __host__ __device__
+  thrust::detail::integral_constant<
+    bool, cudaMemcpyDeviceToDevice == Direction::value
+  >
+  is_device_to_device_copy(ExecutionPolicy const& exec) noexcept
+  {
+    return {};
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> &sys1,
+                       thrust::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> const &sys1,
+                       thrust::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::execution_policy<Sys1> &,
+                       thrust::cuda::execution_policy<Sys2> &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::execution_policy<Sys1> const &,
+                       thrust::cuda::execution_policy<Sys2> const &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> &sys1,
+                       thrust::cuda::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_device_system(thrust::cuda::execution_policy<Sys1> const &sys1,
+                       thrust::cuda::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  /////////////////////////////////////////////////////////////////////////////
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::cuda::execution_policy<Sys1> &,
+                     thrust::execution_policy<Sys2> &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::cuda::execution_policy<Sys1> const &,
+                     thrust::execution_policy<Sys2> const &sys2)
+  THRUST_DECLTYPE_RETURNS(sys2)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> &sys1,
+                     thrust::cuda::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> const &sys1,
+                     thrust::cuda::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> &sys1,
+                     thrust::execution_policy<Sys2> &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+
+  // Device to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  auto
+  select_host_system(thrust::execution_policy<Sys1> const &sys1,
+                     thrust::execution_policy<Sys2> const &)
+  THRUST_DECLTYPE_RETURNS(sys1)
+#endif
+
+  // Device to host.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  cross_system<Sys1, Sys2>
+  select_system(execution_policy<Sys1> const &             sys1,
+                thrust::cpp::execution_policy<Sys2> const &sys2)
+  {
+    thrust::execution_policy<Sys1> &     non_const_sys1 = const_cast<execution_policy<Sys1> &>(sys1);
+    thrust::cpp::execution_policy<Sys2> &non_const_sys2 = const_cast<thrust::cpp::execution_policy<Sys2> &>(sys2);
+    return cross_system<Sys1, Sys2>(non_const_sys1, non_const_sys2);
+  }
+
+  // Host to device.
+  template <class Sys1, class Sys2>
+  __host__ __device__
+  cross_system<Sys1, Sys2>
+  select_system(thrust::cpp::execution_policy<Sys1> const &sys1,
+                execution_policy<Sys2> const &             sys2)
+  {
+    thrust::cpp::execution_policy<Sys1> &non_const_sys1 = const_cast<thrust::cpp::execution_policy<Sys1> &>(sys1);
+    thrust::execution_policy<Sys2> &     non_const_sys2 = const_cast<execution_policy<Sys2> &>(sys2);
+    return cross_system<Sys1, Sys2>(non_const_sys1, non_const_sys2);
+  }
+
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cuda/detail/cub.h b/thrust/system/cuda/detail/cub.h
deleted file mode 100644
index d4c77460d..000000000
--- a/thrust/system/cuda/detail/cub.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/bulk.h>
-
-// we need to carefully undefine and then redefined these macros to ensure that multiple
-// versions of cub can coexist in the same program
-// push_macro & pop_macro were introduced to gcc in version 4.3
-
-// if the macros are already defined, save them and undefine them
-
-#if !defined(__GNUC__) || (THRUST_GCC_VERSION >= 40300)
-#  ifdef CUB_NS_PREFIX
-#    pragma push_macro("CUB_NS_PREFIX")
-#    undef CUB_NS_PREFIX
-#    define CUB_NS_PREFIX_NEEDS_RESTORE
-#  endif
-#  ifdef CUB_NS_POSTFIX
-#    pragma push_macro("CUB_NS_POSTFIX")
-#    undef CUB_NS_POSTFIX
-#    define CUB_NS_POSTFIX_NEEDS_RESTORE
-#  endif
-#  ifdef CUB_CDP
-#    pragma push_macro("CUB_CDP")
-#    undef CUB_CDP
-#    define CUB_CDP_NEEDS_RESTORE
-#  endif
-#  ifdef cub
-#    pragma push_macro("cub")
-#    undef cub
-#    define CUB_NEEDS_RESTORE
-#  endif
-#endif // __GNUC__
-
-// define the macros while we #include our version of cub
-#define CUB_NS_PREFIX namespace thrust { namespace system { namespace cuda { namespace detail {
-#define CUB_NS_POSTFIX                  }                  }                }                  }
-
-#if __BULK_HAS_CUDART__
-#define CUB_CDP 1
-#endif
-
-// rename "cub" so it doesn't collide with another installation elsewhere
-#define cub cub_
-
-#include <thrust/system/cuda/detail/cub/util_namespace.cuh>
-#include <thrust/system/cuda/detail/cub/cub.cuh>
-
-// undef the top-level namespace name
-#undef cub
-
-// undef the macros
-#undef CUB_NS_PREFIX
-#undef CUB_NS_POSTFIX
-
-#ifdef CUB_CDP
-#  undef CUB_CDP
-#endif
-
-// redefine the macros if they were defined previously
-
-#if !defined(__GNUC__) || (THRUST_GCC_VERSION >= 40300)
-#  ifdef CUB_NS_PREFIX_NEEDS_RESTORE
-#    pragma pop_macro("CUB_NS_PREFIX")
-#    undef CUB_NS_PREFIX_NEEDS_RESTORE
-#  endif
-#  ifdef CUB_NS_POSTFIX_NEEDS_RESTORE
-#    pragma pop_macro("CUB_NS_POSTFIX")
-#    undef CUB_NS_POSTFIX_NEEDS_RESTORE
-#  endif
-#  ifdef CUB_CDP_NEEDS_RESTORE
-#    pragma pop_macro("CUB_CDP")
-#    undef CUB_CDP_NEEDS_RESTORE
-#  endif
-#  ifdef CUB_NEEDS_RESTORE
-#    pragma pop_macro("cub")
-#    undef CUB_NEEDS_RESTORE
-#  endif
-#endif // __GNUC__
-
diff --git a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh b/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
deleted file mode 100644
index 62bc49cbf..000000000
--- a/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh
+++ /dev/null
@@ -1,1090 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockDiscontinuity class provides [<em>collective</em>](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                The data type to be flagged.
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items
- *   that differ from their predecessors (or successors).  For example, head flags are convenient
- *   for demarcating disjoint data segments as part of a segmented scan or reduction.
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockDiscontinuity}
- * \par
- * The code snippet below illustrates the head flagging of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
- *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
- *
- *     // Allocate shared memory for BlockDiscontinuity
- *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute head flags for discontinuities in the segment
- *     int head_flags[4];
- *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
- * The corresponding output \p head_flags in those threads will be
- * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
- *
- * \par Performance Considerations
- * - Incurs zero bank conflicts for most types
- *
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         BLOCK_DIM_Y     = 1,
-    int         BLOCK_DIM_Z     = 1,
-    int         PTX_ARCH        = CUB_PTX_ARCH>
-class BlockDiscontinuity
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-
-    /// Shared memory storage layout type (last element from each thread's input)
-    struct _TempStorage
-    {
-        T first_items[BLOCK_THREADS];
-        T last_items[BLOCK_THREADS];
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /// Specialization for when FlagOp has third index param
-    template <typename FlagOp, bool HAS_PARAM = BinaryOpHasIdxParam<T, FlagOp>::HAS_PARAM>
-    struct ApplyOp
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(a, b, idx);
-        }
-    };
-
-    /// Specialization for when FlagOp does not have a third index param
-    template <typename FlagOp>
-    struct ApplyOp<FlagOp, false>
-    {
-        // Apply flag operator
-        static __device__ __forceinline__ bool Flag(FlagOp flag_op, const T &a, const T &b, int idx)
-        {
-            return flag_op(a, b);
-        }
-    };
-
-    /// Templated unrolling of item comparison (inductive case)
-    template <int ITERATION, int MAX_ITERATIONS>
-    struct Iterate
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::Flag(
-                flag_op,
-                input[ITERATION - 1],
-                input[ITERATION],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagHeads(linear_tid, flags, input, flag_op);
-        }
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {
-            flags[ITERATION] = ApplyOp<FlagOp>::Flag(
-                flag_op,
-                input[ITERATION],
-                input[ITERATION + 1],
-                (linear_tid * ITEMS_PER_THREAD) + ITERATION);
-
-            Iterate<ITERATION + 1, MAX_ITERATIONS>::FlagTails(linear_tid, flags, input, flag_op);
-        }
-
-    };
-
-    /// Templated unrolling of item comparison (termination case)
-    template <int MAX_ITERATIONS>
-    struct Iterate<MAX_ITERATIONS, MAX_ITERATIONS>
-    {
-        // Head flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagHeads(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {}
-
-        // Tail flags
-        template <
-            int             ITEMS_PER_THREAD,
-            typename        FlagT,
-            typename        FlagOp>
-        static __device__ __forceinline__ void FlagTails(
-            int                     linear_tid,
-            FlagT                   (&flags)[ITEMS_PER_THREAD],         ///< [out] Calling thread's discontinuity head_flags
-            T                       (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-            FlagOp                  flag_op)                            ///< [in] Binary boolean flag predicate
-        {}
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockDiscontinuity}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockDiscontinuity(
-        TempStorage &temp_storage)  ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>.
-     * The corresponding output \p head_flags in those threads will be
-     * <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        head_flags[0] = (linear_tid == 0) ?
-            1 :                                 // First thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                temp_storage.last_items[linear_tid - 1],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets head flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Collectively compute head flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagHeads(
-     *         head_flags, thread_data, cub::Inequality(), tile_predecessor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }</tt>,
-     * and that \p tile_predecessor_item is \p 0.  The corresponding output \p head_flags in those threads will be
-     * <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeads(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_predecessor_item)              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-    {
-        // Share last item
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        T predecessor_item = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            predecessor_item,
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>.
-     * The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        __syncthreads();
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute tail flags for discontinuities in the segment
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         tail_flags, thread_data, cub::Inequality(), tile_successor_item);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that \p tile_successor_item is \p 125.  The corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagTails(
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op,                            ///< [in] Binary boolean flag predicate
-        T               tile_successor_item)                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-    {
-        // Share first item
-        temp_storage.first_items[linear_tid] = input[0];
-
-        __syncthreads();
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Head & tail flag operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tail_flags, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        head_flags[0] = (linear_tid == 0) ?
-            1 :                                 // First thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                temp_storage.last_items[linear_tid - 1],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is always flagged.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_predecessor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>
-     * and that the tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        head_flags[0] = (linear_tid == 0) ?
-            1 :                                 // First thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                temp_storage.last_items[linear_tid - 1],
-                input[0],
-                linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is always flagged.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
-     * that the \p tile_predecessor_item is \p 0, and that the
-     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        T predecessor_item = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            predecessor_item,
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ?
-            1 :                             // Last thread
-            ApplyOp<FlagOp>::Flag(
-                flag_op,
-                input[ITEMS_PER_THREAD - 1],
-                temp_storage.first_items[linear_tid + 1],
-                (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-    /**
-     * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block.
-     *
-     * \par
-     * - The flag <tt>head_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(</tt><em>previous-item</em><tt>, input<sub><em>i</em></sub>)</tt>
-     *   returns \p true (where <em>previous-item</em> is either the preceding item
-     *   in the same thread or the last item in the previous thread).
-     * - For <em>thread</em><sub>0</sub>, item <tt>input<sub>0</sub></tt> is compared
-     *   against \p tile_predecessor_item.
-     * - The flag <tt>tail_flags<sub><em>i</em></sub></tt> is set for item
-     *   <tt>input<sub><em>i</em></sub></tt> when
-     *   <tt>flag_op(input<sub><em>i</em></sub>, </tt><em>next-item</em><tt>)</tt>
-     *   returns \p true (where <em>next-item</em> is either the next item
-     *   in the same thread or the first item in the next thread).
-     * - For <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>, item
-     *   <tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> is compared
-     *   against \p tile_successor_item.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the head- and tail-flagging of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_discontinuity.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
-     *     typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
-     *
-     *     // Allocate shared memory for BlockDiscontinuity
-     *     __shared__ typename BlockDiscontinuity::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Have thread0 obtain the predecessor item for the entire tile
-     *     int tile_predecessor_item;
-     *     if (threadIdx.x == 0) tile_predecessor_item == ...
-     *
-     *     // Have thread127 obtain the successor item for the entire tile
-     *     int tile_successor_item;
-     *     if (threadIdx.x == 127) tile_successor_item == ...
-     *
-     *     // Collectively compute head and flags for discontinuities in the segment
-     *     int head_flags[4];
-     *     int tail_flags[4];
-     *     BlockDiscontinuity(temp_storage).FlagTails(
-     *         head_flags, tile_predecessor_item, tail_flags, tile_successor_item,
-     *         thread_data, cub::Inequality());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }</tt>,
-     * that the \p tile_predecessor_item is \p 0, and that the
-     * \p tile_successor_item is \p 125.  The corresponding output \p head_flags
-     * in those threads will be <tt>{ [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }</tt>.
-     * and the corresponding output \p tail_flags in those threads will be
-     * <tt>{ [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam FlagT                <b>[inferred]</b> The flag type (must be an integer type)
-     * \tparam FlagOp               <b>[inferred]</b> Binary predicate functor type having member <tt>T operator()(const T &a, const T &b)</tt> or member <tt>T operator()(const T &a, const T &b, unsigned int b_index)</tt>, and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false.  \p b_index is the rank of b in the aggregate tile of data.
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        FlagT,
-        typename        FlagOp>
-    __device__ __forceinline__ void FlagHeadsAndTails(
-        FlagT           (&head_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity head_flags
-        T               tile_predecessor_item,              ///< [in] <b>[<em>thread</em><sub>0</sub> only]</b> Item with which to compare the first tile item (<tt>input<sub>0</sub></tt> from <em>thread</em><sub>0</sub>).
-        FlagT           (&tail_flags)[ITEMS_PER_THREAD],    ///< [out] Calling thread's discontinuity tail_flags
-        T               tile_successor_item,                ///< [in] <b>[<em>thread</em><sub><tt>BLOCK_THREADS</tt>-1</sub> only]</b> Item with which to compare the last tile item (<tt>input</tt><sub><em>ITEMS_PER_THREAD</em>-1</sub> from <em>thread</em><sub><em>BLOCK_THREADS</em>-1</sub>).
-        T               (&input)[ITEMS_PER_THREAD],         ///< [in] Calling thread's input items
-        FlagOp          flag_op)                            ///< [in] Binary boolean flag predicate
-    {
-        // Share first and last items
-        temp_storage.first_items[linear_tid] = input[0];
-        temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1];
-
-        __syncthreads();
-
-        // Set flag for first thread-item
-        T predecessor_item = (linear_tid == 0) ?
-            tile_predecessor_item :              // First thread
-            temp_storage.last_items[linear_tid - 1];
-
-        head_flags[0] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            predecessor_item,
-            input[0],
-            linear_tid * ITEMS_PER_THREAD);
-
-        // Set flag for last thread-item
-        T successor_item = (linear_tid == BLOCK_THREADS - 1) ?
-            tile_successor_item :              // Last thread
-            temp_storage.first_items[linear_tid + 1];
-
-        tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp<FlagOp>::Flag(
-            flag_op,
-            input[ITEMS_PER_THREAD - 1],
-            successor_item,
-            (linear_tid * ITEMS_PER_THREAD) + (ITEMS_PER_THREAD - 1));
-
-        // Set head_flags for remaining items
-        Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, flag_op);
-
-        // Set tail_flags for remaining items
-        Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op);
-    }
-
-
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/block/block_exchange.cuh b/thrust/system/cuda/detail/cub/block/block_exchange.cuh
deleted file mode 100644
index 34aabdd44..000000000
--- a/thrust/system/cuda/detail/cub/block/block_exchange.cuh
+++ /dev/null
@@ -1,1132 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockExchange class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items partitioned onto each thread.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds.  Yields a smaller memory footprint at the expense of decreased parallelism.  (Default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - It is commonplace for blocks of threads to rearrange data items between
- *   threads.  For example, the global memory subsystem prefers access patterns
- *   where data items are "striped" across threads (where consecutive threads access consecutive items),
- *   yet most block-wide operations prefer a "blocked" partitioning of items across threads
- *   (where consecutive items belong to a single thread).
- * - BlockExchange supports the following types of data exchanges:
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>striped</em>](index.html#sec5sec3) arrangements
- *   - Transposing between [<em>blocked</em>](index.html#sec5sec3) and [<em>warp-striped</em>](index.html#sec5sec3) arrangements
- *   - Scattering ranked items to a [<em>blocked arrangement</em>](index.html#sec5sec3)
- *   - Scattering ranked items to a [<em>striped arrangement</em>](index.html#sec5sec3)
- * - \blocked
- *
- * \par A Simple Example
- * \blockcollective{BlockExchange}
- * \par
- * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
- * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
- *
- *     // Allocate shared memory for BlockExchange
- *     __shared__ typename BlockExchange::TempStorage temp_storage;
- *
- *     // Load a tile of data striped across threads
- *     int thread_data[4];
- *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
- *
- *     // Collectively exchange data into a blocked arrangement across threads
- *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of striped input \p thread_data across the block of threads is
- * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- * \par Performance Considerations
- * - Proper device-specific padding ensures zero bank conflicts for most types.
- *
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,
-    int         ITEMS_PER_THREAD,
-    bool        WARP_TIME_SLICING   = false,
-    int         BLOCK_DIM_Y         = 1,
-    int         BLOCK_DIM_Z         = 1,
-    int         PTX_ARCH            = CUB_PTX_ARCH>
-class BlockExchange
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-
-        TILE_ITEMS                  = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        TIME_SLICES                 = (WARP_TIME_SLICING) ? WARPS : 1,
-
-        TIME_SLICED_THREADS         = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS,
-        TIME_SLICED_ITEMS           = TIME_SLICED_THREADS * ITEMS_PER_THREAD,
-
-        WARP_TIME_SLICED_THREADS    = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
-        WARP_TIME_SLICED_ITEMS      = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD,
-
-        // Insert padding if the number of items per thread is a power of two
-        INSERT_PADDING              = 0, // Mooch PowerOfTwo<ITEMS_PER_THREAD>::VALUE,
-        PADDING_ITEMS               = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    typedef T _TempStorage[TIME_SLICED_ITEMS + PADDING_ITEMS];
-
-public:
-
-    /// \smemstorage{BlockExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-    int lane_id;
-    int warp_id;
-    int warp_offset;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for no timeslicing.
-     */
-    __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-        Int2Type<true>  time_slicing)
-    {
-        T temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
-                }
-            }
-
-            __syncthreads();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for no timeslicing
-     */
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __threadfence_block();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-    /**
-     * Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement. Specialized for warp-timeslicing
-     */
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
-        Int2Type<true>  time_slicing)
-    {
-        if (warp_id == 0)
-        {
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                temp_storage[item_offset] = items[ITEM];
-            }
-
-            __threadfence_block();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                items[ITEM] = temp_storage[item_offset];
-            }
-        }
-
-        #pragma unroll
-        for (int SLICE = 1; SLICE < TIME_SLICES; ++SLICE)
-        {
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
-                }
-
-                __threadfence_block();
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    items[ITEM] = temp_storage[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing.
-     */
-    __device__ __forceinline__ void StripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        // No timeslicing
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    __device__ __forceinline__ void StripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-        Int2Type<true>  time_slicing)
-    {
-        // Warp time-slicing
-        T temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            __syncthreads();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Write a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_storage[item_offset] = items[ITEM];
-                    }
-                }
-            }
-
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_items[ITEM] = temp_storage[item_offset];
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for no timeslicing
-     */
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __threadfence_block();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD);
-            if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-
-    /**
-     * Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.  Specialized for warp-timeslicing
-     */
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        T               items[ITEMS_PER_THREAD],   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
-        Int2Type<true>  time_slicing)
-    {
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; ++SLICE)
-        {
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id;
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    temp_storage[item_offset] = items[ITEM];
-                }
-
-                __threadfence_block();
-
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD);
-                    if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                    items[ITEM] = temp_storage[item_offset];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-    /**
-     * Exchanges data items annotated by rank into <em>blocked</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true>  time_slicing)
-    {
-        T temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            __syncthreads();
-
-            const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE;
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage[item_offset] = items[ITEM];
-                }
-            }
-
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-                {
-                    int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM;
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_items[ITEM] = temp_storage[item_offset];
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for no timeslicing.
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<false> time_slicing)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-
-    /**
-     * Exchanges data items annotated by rank into <em>striped</em> arrangement.  Specialized for warp-timeslicing.
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],    ///< [in] Corresponding scatter ranks
-        Int2Type<true> time_slicing)
-    {
-        T temp_items[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++)
-        {
-            const int SLICE_OFFSET  = SLICE * TIME_SLICED_ITEMS;
-            const int SLICE_OOB     = SLICE_OFFSET + TIME_SLICED_ITEMS;
-
-            __syncthreads();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                int item_offset = ranks[ITEM] - SLICE_OFFSET;
-                if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS))
-                {
-                    if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-                    temp_storage[item_offset] = items[ITEM];
-                }
-            }
-
-            __syncthreads();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                // Read a strip of items
-                const int STRIP_OFFSET  = ITEM * BLOCK_THREADS;
-                const int STRIP_OOB     = STRIP_OFFSET + BLOCK_THREADS;
-
-                if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET))
-                {
-                    int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET;
-                    if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS))
-                    {
-                        if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS;
-                        temp_items[ITEM] = temp_storage[item_offset];
-                    }
-                }
-            }
-        }
-
-        // Copy
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            items[ITEM] = temp_items[ITEM];
-        }
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockExchange()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId()),
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockExchange(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId()),
-        warp_offset(warp_id * WARP_TIME_SLICED_ITEMS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Structured exchanges
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Transposes data items from <em>striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a striped arrangement across block threads
-     *     int thread_data[4];
-     *     cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).StripedToBlocked(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of striped input \p thread_data across the block of threads is
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> after loading from global memory.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void StripedToBlocked(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>striped</em> and <em>blocked</em> arrangements.
-    {
-        StripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToStriped(thread_data);
-     *
-     *     // Store data striped across block threads into an ordered tile
-     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }</tt> in
-     * preparation for storing to global memory.
-     *
-     */
-    __device__ __forceinline__ void BlockedToStriped(
-        T               items[ITEMS_PER_THREAD])    ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>striped</em> arrangements.
-    {
-        BlockedToStriped(items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-    /**
-     * \brief Transposes data items from <em>warp-striped</em> arrangement to <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Load a tile of ordered data into a warp-striped arrangement across warp threads
-     *     int thread_data[4];
-     *     cub::LoadSWarptriped<LOAD_DEFAULT>(threadIdx.x, d_data, thread_data);
-     *
-     *     // Collectively exchange data into a blocked arrangement across threads
-     *     BlockExchange(temp_storage).WarpStripedToBlocked(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of warp-striped input \p thread_data across the block of threads is
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * after loading from global memory.  (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void WarpStripedToBlocked(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>warp-striped</em> and <em>blocked</em> arrangements.
-    {
-        WarpStripedToBlocked(items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-    /**
-     * \brief Transposes data items from <em>blocked</em> arrangement to <em>warp-striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement
-     * of 512 integer items partitioned across 128 threads where each thread owns 4 items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_exchange.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
-     *
-     *     // Allocate shared memory for BlockExchange
-     *     __shared__ typename BlockExchange::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively exchange data into a warp-striped arrangement across threads
-     *     BlockExchange(temp_storage).BlockedToWarpStriped(thread_data);
-     *
-     *     // Store data striped across warp threads into an ordered tile
-     *     cub::StoreDirectStriped<STORE_DEFAULT, 128>(threadIdx.x, d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of blocked input \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] }</tt>
-     * in preparation for storing to global memory. (The first 128 items are striped across
-     * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.)
-     *
-     */
-    __device__ __forceinline__ void BlockedToWarpStriped(
-        T                items[ITEMS_PER_THREAD])   ///< [in-out] Items to exchange, converting between <em>blocked</em> and <em>warp-striped</em> arrangements.
-    {
-        BlockedToWarpStriped(items, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Scatter exchanges
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>blocked</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam Offset                               <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToBlocked(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        ScatterToBlocked(items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam Offset                               <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],    ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])    ///< [in] Corresponding scatter ranks
-    {
-        ScatterToStriped(items, ranks, Int2Type<WARP_TIME_SLICING>());
-    }
-
-
-    /**
-     * \brief Exchanges data items annotated by rank into <em>striped</em> arrangement.  Items with rank -1 are not exchanged.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam Offset                               <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToStripedGuarded(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            if (ranks[ITEM] >= 0)
-                temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-    /**
-     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam Offset                               <b>[inferred]</b> Signed integer type for local offsets
-     * \tparam ValidFlag                            <b>[inferred]</b> Flag type denoting which items are valid
-     */
-    template <typename Offset, typename ValidFlag>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD],        ///< [in] Corresponding scatter ranks
-        ValidFlag       is_valid[ITEMS_PER_THREAD])     ///< [in] Corresponding flag denoting item validity
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = ranks[ITEM];
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            if (is_valid[ITEM])
-                temp_storage[item_offset] = items[ITEM];
-        }
-
-        __syncthreads();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-    //@}  end member group
-
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-template <
-    typename    T,
-    int         ITEMS_PER_THREAD,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpExchange
-{
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        // Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        WARP_ITEMS                  = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-
-        // Insert padding if the number of items per thread is a power of two
-        INSERT_PADDING              = 0, // Mooch PowerOfTwo<ITEMS_PER_THREAD>::VALUE,
-        PADDING_ITEMS               = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    typedef T _TempStorage[WARP_ITEMS + PADDING_ITEMS];
-
-public:
-
-    /// \smemstorage{WarpExchange}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    int             lane_id;
-
-public:
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpExchange(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * \brief Exchanges valid data items annotated by rank into <em>striped</em> arrangement.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \tparam Offset                               <b>[inferred]</b> Signed integer type for local offsets
-     */
-    template <typename Offset>
-    __device__ __forceinline__ void ScatterToStriped(
-        T               items[ITEMS_PER_THREAD],        ///< [in-out] Items to exchange
-        Offset          ranks[ITEMS_PER_THREAD])        ///< [in] Corresponding scatter ranks
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]);
-            temp_storage[ranks[ITEM]] = items[ITEM];
-        }
-
-        __threadfence_block();
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id;
-            if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset);
-            items[ITEM] = temp_storage[item_offset];
-        }
-    }
-
-};
-
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_histogram.cuh b/thrust/system/cuda/detail/cub/block/block_histogram.cuh
deleted file mode 100644
index 1ec783889..000000000
--- a/thrust/system/cuda/detail/cub/block/block_histogram.cuh
+++ /dev/null
@@ -1,415 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_histogram_sort.cuh"
-#include "specializations/block_histogram_atomic.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms.
- */
-enum BlockHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * Sorting followed by differentiation.  Execution is comprised of two phases:
-     * -# Sort the data using efficient radix sort
-     * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts.
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     */
-    BLOCK_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * Use atomic addition to update byte counts directly
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     */
-    BLOCK_HISTO_ATOMIC,
-};
-
-
-
-/******************************************************************************
- * Block histogram
- ******************************************************************************/
-
-
-/**
- * \brief The BlockHistogram class provides [<em>collective</em>](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The sample type being histogrammed (must be castable to an integer bin identifier)
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam BINS                 The number bins within the histogram
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- *   counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- * - BlockHistogram can be optionally specialized to use different algorithms:
- *   -# <b>cub::BLOCK_HISTO_SORT</b>.  Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm)
- *   -# <b>cub::BLOCK_HISTO_ATOMIC</b>.  Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockHistogram}
- * \par
- * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
- * are partitioned across 128 threads where each thread owns 4 samples.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
- *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
- *
- *     // Allocate shared memory for BlockHistogram
- *     __shared__ typename BlockHistogram::TempStorage temp_storage;
- *
- *     // Allocate shared memory for block-wide histogram bin counts
- *     __shared__ unsigned int smem_histogram[256];
- *
- *     // Obtain input samples per thread
- *     unsigned char data[4];
- *     ...
- *
- *     // Compute the block-wide histogram
- *     BlockHistogram(temp_storage).Histogram(data, smem_histogram);
- *
- * \endcode
- *
- * \par Performance and Usage Considerations
- * - The histogram output can be constructed in shared or global memory
- * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    int                     BINS,
-    BlockHistogramAlgorithm ALGORITHM           = BLOCK_HISTO_SORT,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockHistogram
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * targeted device architecture.  BLOCK_HISTO_ATOMIC can only be used
-     * on version SM120 or later.  Otherwise BLOCK_HISTO_SORT is used
-     * regardless.
-     */
-    static const BlockHistogramAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ?
-            BLOCK_HISTO_SORT :
-            ALGORITHM;
-
-    /// Internal specialization.
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT),
-        BlockHistogramSort<T, BLOCK_DIM_X, ITEMS_PER_THREAD, BINS, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>,
-        BlockHistogramAtomic<BINS> >::Type InternalBlockHistogram;
-
-    /// Shared memory storage layout type for BlockHistogram
-    typedef typename InternalBlockHistogram::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /// \smemstorage{BlockHistogram}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockHistogram(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Histogram operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Initialize the shared histogram counters to zero.
-     *
-     * \par Snippet
-     * The code snippet below illustrates a the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
-     */
-    template <typename HistoCounter>
-    __device__ __forceinline__ void InitHistogram(HistoCounter histogram[BINS])
-    {
-        // Initialize histogram bin counts to zeros
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            histogram[histo_offset + linear_tid] = 0;
-        }
-    }
-
-
-    /**
-     * \brief Constructs a block-wide histogram in shared/global memory.  Each thread contributes an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a 256-bin histogram of 512 integer samples that
-     * are partitioned across 128 threads where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Compute the block-wide histogram
-     *     BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
-     */
-    template <
-        typename            HistoCounter>
-    __device__ __forceinline__ void Histogram(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                ///< [out] Reference to shared/global memory histogram
-    {
-        // Initialize histogram bin counts to zeros
-        InitHistogram(histogram);
-
-        __syncthreads();
-
-        // Composite the histogram
-        InternalBlockHistogram(temp_storage).Composite(items, histogram);
-    }
-
-
-
-    /**
-     * \brief Updates an existing block-wide histogram in shared/global memory.  Each thread composites an array of input elements.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a the initialization and update of a
-     * histogram of 512 integer samples that are partitioned across 128 threads
-     * where each thread owns 4 samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_histogram.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each
-     *     typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
-     *
-     *     // Allocate shared memory for BlockHistogram
-     *     __shared__ typename BlockHistogram::TempStorage temp_storage;
-     *
-     *     // Allocate shared memory for block-wide histogram bin counts
-     *     __shared__ unsigned int smem_histogram[256];
-     *
-     *     // Obtain input samples per thread
-     *     unsigned char thread_samples[4];
-     *     ...
-     *
-     *     // Initialize the block-wide histogram
-     *     BlockHistogram(temp_storage).InitHistogram(smem_histogram);
-     *
-     *     // Update the block-wide histogram
-     *     BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram);
-     *
-     * \endcode
-     *
-     * \tparam HistoCounter         <b>[inferred]</b> Histogram counter type
-     */
-    template <
-        typename            HistoCounter>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                 ///< [out] Reference to shared/global memory histogram
-    {
-        InternalBlockHistogram(temp_storage).Composite(items, histogram);
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_load.cuh b/thrust/system/cuda/detail/cub/block/block_load.cuh
deleted file mode 100644
index afa8ff7cf..000000000
--- a/thrust/system/cuda/detail/cub/block/block_load.cuh
+++ /dev/null
@@ -1,1086 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for reading linear tiles of data into the CUDA thread block.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    // Load directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    int bounds = valid_items - (linear_tid * ITEMS_PER_THREAD);
-
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (ITEM < bounds)
-        {
-            items[ITEM] = block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements..
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectBlocked(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = oob_default;
-    }
-
-    LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-}
-
-
-/**
- * \brief Load a linear segment of items into a blocked arrangement across the thread block.
- *
- * \blocked
- *
- * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned
- *
- * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD>
-__device__ __forceinline__ void LoadDirectBlockedVectorized(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T               *block_ptr,                 ///< [in] Input pointer for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    enum
-    {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
-
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
-
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
-
-    // Vector items
-    Vector vec_items[VECTORS_PER_THREAD];
-
-    // Aliased input ptr
-    Vector *ptr = reinterpret_cast<Vector*>(block_ptr + (linear_tid * VEC_SIZE * VECTORS_PER_THREAD));
-
-    // Load directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++)
-    {
-        vec_items[ITEM] = ptr[ITEM];
-    }
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = reinterpret_cast<T*>(vec_items)[ITEM];
-    }
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[(ITEM * BLOCK_THREADS) + linear_tid];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    int bounds = valid_items - linear_tid;
-
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (ITEM * BLOCK_THREADS < bounds)
-        {
-            items[ITEM] = block_itr[linear_tid + (ITEM * BLOCK_THREADS)];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    int             BLOCK_THREADS,
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,                ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = oob_default;
-    }
-
-    LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items)                ///< [in] Number of valid items to load
-{
-    int tid                 = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid                 = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset         = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-    int bounds              = valid_items - warp_offset - tid;
-
-    // Load directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if ((ITEM * CUB_PTX_WARP_THREADS) < bounds)
-        {
-            items[ITEM] = block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)];
-        }
-    }
-}
-
-
-/**
- * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to load.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam InputIterator        <b>[inferred]</b> The random-access iterator type for input \iterator.
- */
-template <
-    typename        T,
-    int             ITEMS_PER_THREAD,
-    typename        InputIterator>
-__device__ __forceinline__ void LoadDirectWarpStriped(
-    int             linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-    T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-    int             valid_items,               ///< [in] Number of valid items to load
-    T               oob_default)                ///< [in] Default value to assign out-of-bound items
-{
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        items[ITEM] = oob_default;
-    }
-
-    LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-}
-
-
-//@}  end member group
-
-/** @} */       // end group UtilIo
-
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockLoad abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block.
- */
-enum BlockLoadAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read
-     * directly from memory.  The thread block reads items in a parallel "raking" fashion: thread<sub><em>i</em></sub>
-     * reads the <em>i</em><sup>th</sup> segment of consecutive elements.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number items per thread).
-     */
-    BLOCK_LOAD_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is read directly
-     * from memory using CUDA's built-in vectorized loads as a coalescing optimization.
-     * The thread block reads items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector loads to
-     * read the <em>i</em><sup>th</sup> segment of consecutive elements.
-     *
-     * For example, <tt>ld.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the the
-     *   access stride between threads (i.e., the number items per thread) exceeds the
-     *   maximum vector load width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p InputIterator is not a simple pointer type
-     *   - The block input offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_LOAD_VECTORIZE,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>striped arrangement</em>](index.html#sec5sec3) of data is read
-     * directly from memory and then is locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3). The thread block
-     * reads items in a parallel "strip-mining" fashion:
-     * thread<sub><em>i</em></sub> reads items having stride \p BLOCK_THREADS
-     * between them. cub::BlockExchange is then used to locally reorder the items
-     * into a [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     */
-    BLOCK_LOAD_TRANSPOSE,
-
-
-    /**
-     * \par Overview
-     *
-     * A [<em>warp-striped arrangement</em>](index.html#sec5sec3) of data is read
-     * directly from memory and then is locally transposed into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3). Each warp reads its own
-     * contiguous segment in a parallel "strip-mining" fashion: lane<sub><em>i</em></sub>
-     * reads items having stride \p WARP_THREADS between them. cub::BlockExchange
-     * is then used to locally reorder the items into a
-     * [<em>blocked arrangement</em>](index.html#sec5sec3).
-     *
-     * \par Usage Considerations
-     * - BLOCK_THREADS must be a multiple of WARP_THREADS
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items loaded per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives.
-     */
-    BLOCK_LOAD_WARP_TRANSPOSE,
-};
-
-
-/**
- * \brief The BlockLoad class provides [<em>collective</em>](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [<em>blocked arrangement</em>](index.html#sec5sec3) across a CUDA thread block.  ![](block_load_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam InputIterator        The input iterator type \iterator.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockLoadAlgorithm tuning policy.  default: cub::BLOCK_LOAD_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockLoad class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockLoadAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockLoad can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_LOAD_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory.  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory using CUDA's built-in vectorized loads as a
- *      coalescing optimization.    [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_TRANSPOSE</b>.  A [<em>striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- *   -# <b>cub::BLOCK_LOAD_WARP_TRANSPOSE</b>.  A [<em>warp-striped arrangement</em>](index.html#sec5sec3)
- *      of data is read directly from memory and is then locally transposed into a
- *      [<em>blocked arrangement</em>](index.html#sec5sec3).  [More...](\ref cub::BlockLoadAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockLoad}
- * \par
- * The code snippet below illustrates the loading of a linear
- * segment of 512 integers into a "blocked" arrangement across 128 threads where each
- * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
- * meaning memory references are efficiently coalesced using a warp-striped access
- * pattern (after which items are locally reordered among threads).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
- *
- *     // Allocate shared memory for BlockLoad
- *     __shared__ typename BlockLoad::TempStorage temp_storage;
- *
- *     // Load a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     BlockLoad(temp_storage).Load(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- * The set of \p thread_data across the block of threads in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename            InputIterator,
-    int                 BLOCK_DIM_X,
-    int                 ITEMS_PER_THREAD,
-    BlockLoadAlgorithm  ALGORITHM           = BLOCK_LOAD_DIRECT,
-    bool                WARP_TIME_SLICING   = false,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockLoad
-{
-private:
-
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Load helper
-    template <BlockLoadAlgorithm _POLICY, int DUMMY>
-    struct LoadInternal;
-
-
-    /**
-     * BLOCK_LOAD_DIRECT specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_VECTORIZE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization)
-        __device__ __forceinline__ void Load(
-            T               *block_ptr,                     ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load
-        {
-            LoadDirectBlockedVectorized(linear_tid, block_ptr, items);
-        }
-
-        /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization)
-        template <
-            typename T,
-            typename _InputIterator>
-        __device__ __forceinline__ void Load(
-            _InputIterator    block_itr,                  ///< [in] The thread block's base input iterator for loading from
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range (skips vectorization)
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization)
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).StripedToBlocked(items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).StripedToBlocked(items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).StripedToBlocked(items);
-        }
-
-    };
-
-
-    /**
-     * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper
-     */
-    template <int DUMMY>
-    struct LoadInternal<BLOCK_LOAD_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ LoadInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Load a linear segment of items from memory
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD])     ///< [out] Data to load{
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
-        }
-
-        /// Load a linear segment of items from memory, guarded by range
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items)                    ///< [in] Number of valid items to load
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
-        }
-
-
-        /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-        __device__ __forceinline__ void Load(
-            InputIterator   block_itr,                      ///< [in] The thread block's base input iterator for loading from
-            T               (&items)[ITEMS_PER_THREAD],     ///< [out] Data to load
-            int             valid_items,                    ///< [in] Number of valid items to load
-            T               oob_default)                    ///< [in] Default value to assign out-of-bound items
-        {
-            LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default);
-            BlockExchange(temp_storage).WarpStripedToBlocked(items);
-        }
-    };
-
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef LoadInternal<ALGORITHM, 0> InternalLoad;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalLoad::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-    /// \smemstorage{BlockLoad}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockLoad(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Load a linear segment of items from memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt> and \p valid_items is \p 5.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items remaining unassigned).
-     *
-     */
-    __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items)                ///< [in] Number of valid items to load
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items);
-    }
-
-
-    /**
-     * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded loading of a linear
-     * segment of 512 integers into a "blocked" arrangement across 128 threads where each
-     * thread owns 4 consecutive items.  The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE,
-     * meaning memory references are efficiently coalesced using a warp-striped access
-     * pattern (after which items are locally reordered among threads).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_load.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_WARP_TRANSPOSE> BlockLoad;
-     *
-     *     // Allocate shared memory for BlockLoad
-     *     __shared__ typename BlockLoad::TempStorage temp_storage;
-     *
-     *     // Load a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, 1, 2, 3, 4, 5, 6...</tt>,
-     * \p valid_items is \p 5, and the out-of-bounds default is \p -1.
-     * The set of \p thread_data across the block of threads in those threads will be
-     * <tt>{ [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }</tt>, with only the first two threads
-     * being unmasked to load portions of valid data (and other items are assigned \p -1)
-     *
-     */
-    __device__ __forceinline__ void Load(
-        InputIterator   block_itr,                  ///< [in] The thread block's base input iterator for loading from
-        T               (&items)[ITEMS_PER_THREAD], ///< [out] Data to load
-        int             valid_items,                ///< [in] Number of valid items to load
-        T               oob_default)                ///< [in] Default value to assign out-of-bound items
-    {
-        InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default);
-    }
-
-
-    //@}  end member group
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh b/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
deleted file mode 100644
index 4b5a6a761..000000000
--- a/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh
+++ /dev/null
@@ -1,485 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
- */
-
-#pragma once
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_scan.cuh"
-#include "../block/block_scan.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
- * \ingroup BlockModule
- *
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam RADIX_BITS           The number of radix bits per digit place
- * \tparam DESCENDING           Whether or not the sorted-order is high-to-low
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * Blah...
- * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits).
- * - \blocked
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par Examples
- * \par
- * - <b>Example 1:</b> Simple radix rank of 32-bit integer keys
- *      \code
- *      #include <cub/cub.cuh>
- *
- *      template <int BLOCK_THREADS>
- *      __global__ void ExampleKernel(...)
- *      {
- *
- *      \endcode
- */
-template <
-    int                     BLOCK_DIM_X,
-    int                     RADIX_BITS,
-    bool                    DESCENDING,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixRank
-{
-private:
-
-    /******************************************************************************
-     * Type definitions and constants
-     ******************************************************************************/
-
-    // Integer type for digit counters (to be packed into words of type PackedCounters)
-    typedef unsigned short DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte),
-        unsigned long long,
-        unsigned int>::Type PackedCounter;
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        RADIX_DIGITS                = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_COUNTER           = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER       = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO               = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO           = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES           = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0),                // Always at least one lane
-        COUNTER_LANES               = 1 << LOG_COUNTER_LANES,
-
-        // The number of packed counters per thread (plus one for padding)
-        RAKING_SEGMENT              = COUNTER_LANES + 1,
-
-        LOG_SMEM_BANKS              = CUB_LOG_SMEM_BANKS(PTX_ARCH),
-        SMEM_BANKS                  = 1 << LOG_SMEM_BANKS,
-    };
-
-
-    /// BlockScan type
-    typedef BlockScan<
-            PackedCounter,
-            BLOCK_DIM_X,
-            INNER_SCAN_ALGORITHM,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockScan;
-
-
-    /// Shared memory storage layout type for BlockRadixRank
-    struct _TempStorage
-    {
-        // Storage for scanning local ranks
-        typename BlockScan::TempStorage block_scan;
-
-        union
-        {
-            DigitCounter            digit_counters[COUNTER_LANES + 1][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter           raking_grid[BLOCK_THREADS][RAKING_SEGMENT];
-        };
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-    /// Copy of raking segment, promoted to registers
-    PackedCounter cached_segment[RAKING_SEGMENT];
-
-
-    /******************************************************************************
-     * Templated iteration
-     ******************************************************************************/
-
-    // General template iteration
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        /**
-         * Decode keys.  Decodes the radix digit from the current digit place
-         * and increments the thread's corresponding counter in shared
-         * memory for that digit.
-         *
-         * Saves both (1) the prior value of that counter (the key's
-         * thread-local exclusive prefix sum for that digit), and (2) the shared
-         * memory offset of the counter (for later use).
-         */
-        template <typename UnsignedBits, int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void DecodeKeys(
-            BlockRadixRank  &cta,                                   // BlockRadixRank instance
-            UnsignedBits    (&keys)[KEYS_PER_THREAD],               // Key to decode
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value (out parameter)
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],     // Counter smem offset (out parameter)
-            int             current_bit,                            // The least-significant bit position of the current digit to extract
-            int             num_bits)                               // The number of bits in the current digit
-        {
-            // Get digit
-            UnsignedBits digit = BFE(keys[COUNT], current_bit, num_bits);
-
-            // Get sub-counter
-            UnsignedBits sub_counter = digit >> LOG_COUNTER_LANES;
-
-            // Get counter lane
-            UnsignedBits counter_lane = digit & (COUNTER_LANES - 1);
-
-            if (DESCENDING)
-            {
-                sub_counter = PACKING_RATIO - 1 - sub_counter;
-                counter_lane = COUNTER_LANES - 1 - counter_lane;
-            }
-
-            // Pointer to smem digit counter
-            digit_counters[COUNT] = &cta.temp_storage.digit_counters[counter_lane][cta.linear_tid][sub_counter];
-
-            // Load thread-exclusive prefix
-            thread_prefixes[COUNT] = *digit_counters[COUNT];
-
-            // Store inclusive prefix
-            *digit_counters[COUNT] = thread_prefixes[COUNT] + 1;
-
-            // Iterate next key
-            Iterate<COUNT + 1, MAX>::DecodeKeys(cta, keys, thread_prefixes, digit_counters, current_bit, num_bits);
-        }
-
-
-        // Termination
-        template <int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void UpdateRanks(
-            int             (&ranks)[KEYS_PER_THREAD],              // Local ranks (out parameter)
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],    // Prefix counter value
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD])     // Counter smem offset
-        {
-            // Add in threadblock exclusive prefix
-            ranks[COUNT] = thread_prefixes[COUNT] + *digit_counters[COUNT];
-
-            // Iterate next key
-            Iterate<COUNT + 1, MAX>::UpdateRanks(ranks, thread_prefixes, digit_counters);
-        }
-    };
-
-
-    // Termination
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // DecodeKeys
-        template <typename UnsignedBits, int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void DecodeKeys(
-            BlockRadixRank  &cta,
-            UnsignedBits    (&keys)[KEYS_PER_THREAD],
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
-            DigitCounter*   (&digit_counters)[KEYS_PER_THREAD],
-            int             current_bit,                            // The least-significant bit position of the current digit to extract
-            int             num_bits)                               // The number of bits in the current digit
-        {}
-
-
-        // UpdateRanks
-        template <int KEYS_PER_THREAD>
-        static __device__ __forceinline__ void UpdateRanks(
-            int             (&ranks)[KEYS_PER_THREAD],
-            DigitCounter    (&thread_prefixes)[KEYS_PER_THREAD],
-            DigitCounter    *(&digit_counters)[KEYS_PER_THREAD])
-        {}
-    };
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /**
-     * Internal storage allocator
-     */
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /**
-     * Performs upsweep raking reduction, returning the aggregate
-     */
-    __device__ __forceinline__ PackedCounter Upsweep()
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
-        PackedCounter *raking_ptr;
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data into registers
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                cached_segment[i] = smem_raking_ptr[i];
-            }
-            raking_ptr = cached_segment;
-        }
-        else
-        {
-            raking_ptr = smem_raking_ptr;
-        }
-
-        return ThreadReduce<RAKING_SEGMENT>(raking_ptr, Sum());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        PackedCounter raking_partial)
-    {
-        PackedCounter *smem_raking_ptr = temp_storage.raking_grid[linear_tid];
-
-        PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ?
-            cached_segment :
-            smem_raking_ptr;
-
-        // Exclusive raking downsweep scan
-        ThreadScanExclusive<RAKING_SEGMENT>(raking_ptr, raking_ptr, Sum(), raking_partial);
-
-        if (MEMOIZE_OUTER_SCAN)
-        {
-            // Copy data back to smem
-            #pragma unroll
-            for (int i = 0; i < RAKING_SEGMENT; i++)
-            {
-                smem_raking_ptr[i] = cached_segment[i];
-            }
-        }
-    }
-
-
-    /**
-     * Reset shared memory digit counters
-     */
-    __device__ __forceinline__ void ResetCounters()
-    {
-        // Reset shared memory digit counters
-        #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES + 1; LANE++)
-        {
-            *((PackedCounter*) temp_storage.digit_counters[LANE][linear_tid]) = 0;
-        }
-    }
-
-
-    /**
-     * Scan shared memory digit counters.
-     */
-    __device__ __forceinline__ void ScanCounters()
-    {
-        // Upsweep scan
-        PackedCounter raking_partial = Upsweep();
-
-        // Compute exclusive sum
-        PackedCounter exclusive_partial;
-        PackedCounter packed_aggregate;
-        BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, packed_aggregate);
-
-        // Propagate totals in packed fields
-        #pragma unroll
-        for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++)
-        {
-            exclusive_partial += packed_aggregate << (sizeof(DigitCounter) * 8 * PACKED);
-        }
-
-        // Downsweep scan with exclusive partial
-        ExclusiveDownsweep(exclusive_partial);
-    }
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixRank(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Raking
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Rank keys.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits)                           ///< [in] The number of bits in the current digit
-    {
-        DigitCounter    thread_prefixes[KEYS_PER_THREAD];   // For each key, the count of previous keys in this tile having the same digit
-        DigitCounter*   digit_counters[KEYS_PER_THREAD];    // For each key, the byte-offset of its corresponding digit counter in smem
-
-        // Reset shared memory digit counters
-        ResetCounters();
-
-        // Decode keys and update digit counters
-        Iterate<0, KEYS_PER_THREAD>::DecodeKeys(*this, keys, thread_prefixes, digit_counters, current_bit, num_bits);
-
-        __syncthreads();
-
-        // Scan shared memory counters
-        ScanCounters();
-
-        __syncthreads();
-
-        // Extract the local ranks of each key
-        Iterate<0, KEYS_PER_THREAD>::UpdateRanks(ranks, thread_prefixes, digit_counters);
-    }
-
-
-    /**
-     * \brief Rank keys.  For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread.
-     */
-    template <
-        typename        UnsignedBits,
-        int             KEYS_PER_THREAD>
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&keys)[KEYS_PER_THREAD],           ///< [in] Keys for this tile
-        int             (&ranks)[KEYS_PER_THREAD],          ///< [out] For each key, the local rank within the tile (out parameter)
-        int             current_bit,                        ///< [in] The least-significant bit position of the current digit to extract
-        int             num_bits,                           ///< [in] The number of bits in the current digit
-        int             &inclusive_digit_prefix)            ///< [out] The incluisve prefix sum for the digit threadIdx.x
-    {
-        // Rank keys
-        RankKeys(keys, ranks, current_bit, num_bits);
-
-        // Get the inclusive and exclusive digit totals corresponding to the calling thread.
-        if ((BLOCK_THREADS == RADIX_DIGITS) || (linear_tid < RADIX_DIGITS))
-        {
-            int bin_idx = (DESCENDING) ?
-                RADIX_DIGITS - linear_tid - 1 :
-                linear_tid;
-
-            // Obtain ex/inclusive digit counts.  (Unfortunately these all reside in the
-            // first counter column, resulting in unavoidable bank conflicts.)
-            int counter_lane = (bin_idx & (COUNTER_LANES - 1));
-            int sub_counter = bin_idx >> (LOG_COUNTER_LANES);
-            inclusive_digit_prefix = temp_storage.digit_counters[counter_lane + 1][0][sub_counter];
-        }
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh b/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
deleted file mode 100644
index 032f36783..000000000
--- a/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh
+++ /dev/null
@@ -1,863 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block.
- */
-
-
-#pragma once
-
-#include "block_exchange.cuh"
-#include "block_radix_rank.cuh"
-#include "../util_ptx.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockRadixSort class provides [<em>collective</em>](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method.  ![](sorting_logo.png)
- * \ingroup BlockModule
- *
- * \tparam Key                  Key type
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of items per thread
- * \tparam Value                <b>[optional]</b> Value type (default: cub::NullType, which indicates a keys-only sort)
- * \tparam RADIX_BITS           <b>[optional]</b> The number of radix bits per digit place (default: 4 bits)
- * \tparam MEMOIZE_OUTER_SCAN   <b>[optional]</b> Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise).
- * \tparam INNER_SCAN_ALGORITHM <b>[optional]</b> The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS)
- * \tparam SMEM_CONFIG          <b>[optional]</b> Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- *   items into ascending order.  It relies upon a positional representation for
- *   keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- *   characters, etc.) specified from least-significant to most-significant.  For a
- *   given input sequence of keys and a set of rules specifying a total ordering
- *   of the symbolic alphabet, the radix sorting method produces a lexicographic
- *   ordering of those keys.
- * - BlockRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- *   <tt>unsigned char</tt>, \p int, \p double, etc.  Within each key, the implementation treats fixed-length
- *   bit-sequences of \p RADIX_BITS as radix digit places.  Although the direct radix sorting
- *   method can only be applied to unsigned integral types, BlockRadixSort
- *   is able to sort signed and floating-point types via simple bit-wise transformations
- *   that ensure lexicographic key ordering.
- * - \rowmajor
- *
- * \par Performance Considerations
- * - \granularity
- *
- * \par A Simple Example
- * \blockcollective{BlockRadixSort}
- * \par
- * The code snippet below illustrates a sort of 512 integer keys that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
- *
- *     // Allocate shared memory for BlockRadixSort
- *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_keys[4];
- *     ...
- *
- *     // Collectively sort the keys
- *     BlockRadixSort(temp_storage).Sort(thread_keys);
- *
- *     ...
- * \endcode
- * \par
- * Suppose the set of input \p thread_keys across the block of threads is
- * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
- * corresponding output \p thread_keys in those threads will be
- * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
- *
- */
-template <
-    typename                Key,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    typename                Value                   = NullType,
-    int                     RADIX_BITS              = 4,
-    bool                    MEMOIZE_OUTER_SCAN      = (CUB_PTX_ARCH >= 350) ? true : false,
-    BlockScanAlgorithm      INNER_SCAN_ALGORITHM    = BLOCK_SCAN_WARP_SCANS,
-    cudaSharedMemConfig     SMEM_CONFIG             = cudaSharedMemBankSizeFourByte,
-    int                     BLOCK_DIM_Y             = 1,
-    int                     BLOCK_DIM_Z             = 1,
-    int                     PTX_ARCH                = CUB_PTX_ARCH>
-class BlockRadixSort
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        // The thread block size in threads
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        // Whether or not there are values to be trucked along with keys
-        KEYS_ONLY                   = Equals<Value, NullType>::VALUE,
-    };
-
-    // Key traits and unsigned bits type
-    typedef NumericTraits<Key>                  KeyTraits;
-    typedef typename KeyTraits::UnsignedBits    UnsignedBits;
-
-    /// Ascending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            false,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        AscendingBlockRadixRank;
-
-    /// Descending BlockRadixRank utility type
-    typedef BlockRadixRank<
-            BLOCK_DIM_X,
-            RADIX_BITS,
-            true,
-            MEMOIZE_OUTER_SCAN,
-            INNER_SCAN_ALGORITHM,
-            SMEM_CONFIG,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        DescendingBlockRadixRank;
-
-    /// BlockExchange utility type for keys
-    typedef BlockExchange<Key, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeKeys;
-
-    /// BlockExchange utility type for values
-    typedef BlockExchange<Value, BLOCK_DIM_X, ITEMS_PER_THREAD, false, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchangeValues;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        union
-        {
-            typename AscendingBlockRadixRank::TempStorage  asending_ranking_storage;
-            typename DescendingBlockRadixRank::TempStorage descending_ranking_storage;
-            typename BlockExchangeKeys::TempStorage        exchange_keys;
-            typename BlockExchangeValues::TempStorage      exchange_values;
-        };
-    };
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-    /// Rank keys (specialized for ascending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<false> is_descending)
-    {
-        AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// Rank keys (specialized for descending sort)
-    __device__ __forceinline__ void RankKeys(
-        UnsignedBits    (&unsigned_keys)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        int             begin_bit,
-        int             pass_bits,
-        Int2Type<true>  is_descending)
-    {
-        DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys(
-            unsigned_keys,
-            ranks,
-            begin_bit,
-            pass_bits);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-blocked arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        Value           (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> is_keys_only,
-        Int2Type<true>  is_blocked)
-    {
-        __syncthreads();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for key-value sort, to-striped arrangement)
-    __device__ __forceinline__ void ExchangeValues(
-        Value           (&values)[ITEMS_PER_THREAD],
-        int             (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<false> is_keys_only,
-        Int2Type<false> is_blocked)
-    {
-        __syncthreads();
-
-        // Exchange values through shared memory in blocked arrangement
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-    }
-
-    /// ExchangeValues (specialized for keys-only sort)
-    template <int IS_BLOCKED>
-    __device__ __forceinline__ void ExchangeValues(
-        Value                   (&values)[ITEMS_PER_THREAD],
-        int                     (&ranks)[ITEMS_PER_THREAD],
-        Int2Type<true>          is_keys_only,
-        Int2Type<IS_BLOCKED>    is_blocked)
-    {}
-
-    /// Sort blocked arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlocked(
-        Key                     (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        Value                   (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            __syncthreads();
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            // Quit if done
-            if (begin_bit >= end_bit) break;
-
-            __syncthreads();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-    /// Sort blocked -> striped arrangement
-    template <int DESCENDING, int KEYS_ONLY>
-    __device__ __forceinline__ void SortBlockedToStriped(
-        Key                     (&keys)[ITEMS_PER_THREAD],          ///< Keys to sort
-        Value                   (&values)[ITEMS_PER_THREAD],        ///< Values to sort
-        int                     begin_bit,                          ///< The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                            ///< The past-the-end (most-significant) bit index needed for key comparison
-        Int2Type<DESCENDING>    is_descending,                      ///< Tag whether is a descending-order sort
-        Int2Type<KEYS_ONLY>     is_keys_only)                       ///< Tag whether is keys-only sort
-    {
-        UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] =
-            reinterpret_cast<UnsignedBits (&)[ITEMS_PER_THREAD]>(keys);
-
-        // Twiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-        }
-
-        // Radix sorting passes
-        while (true)
-        {
-            int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit);
-
-            // Rank the blocked keys
-            int ranks[ITEMS_PER_THREAD];
-            RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending);
-            begin_bit += RADIX_BITS;
-
-            __syncthreads();
-
-            // Check if this is the last pass
-            if (begin_bit >= end_bit)
-            {
-                // Last pass exchanges keys through shared memory in striped arrangement
-                BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks);
-
-                // Last pass exchanges through shared memory in striped arrangement
-                ExchangeValues(values, ranks, is_keys_only, Int2Type<false>());
-
-                // Quit
-                break;
-            }
-
-            // Exchange keys through shared memory in blocked arrangement
-            BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks);
-
-            // Exchange values through shared memory in blocked arrangement
-            ExchangeValues(values, ranks, is_keys_only, Int2Type<true>());
-
-            __syncthreads();
-        }
-
-        // Untwiddle bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-        }
-    }
-
-
-
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockRadixSort(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangements)
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Performs an ascending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     */
-    __device__ __forceinline__ void Sort(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void Sort(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-    /**
-     * \brief Performs a descending block-wide radix sort over a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys.
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).Sort(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.
-     * The corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     */
-    __device__ __forceinline__ void SortDescending(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending block-wide radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values.
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).Sort(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescending(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlocked(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Sorting (blocked arrangement -> striped arrangement)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs an ascending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<false>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive keys.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each
-     *     typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     ...
-     *
-     *     // Collectively sort the keys
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        NullType values[ITEMS_PER_THREAD];
-
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    /**
-     * \brief Performs a descending radix sort across a [<em>blocked arrangement</em>](index.html#sec5sec3) of keys and values, leaving them in a [<em>striped arrangement</em>](index.html#sec5sec3).
-     *
-     * \par
-     * - BlockRadixSort can only accommodate one associated tile of values. To "truck along"
-     *   more than one tile of values, simply perform a key-value sort of the keys paired
-     *   with a temporary value array that enumerates the key indices.  The reordered indices
-     *   can then be used as a gather-vector for exchanging other associated tile data through
-     *   shared memory.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sort of 512 integer keys and values that
-     * are initially partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive pairs.  The final partitioning is striped.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each
-     *     typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
-     *
-     *     // Allocate shared memory for BlockRadixSort
-     *     __shared__ typename BlockRadixSort::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_keys[4];
-     *     int thread_values[4];
-     *     ...
-     *
-     *     // Collectively sort the keys and values among block threads
-     *     BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_keys across the block of threads is
-     * <tt>{ [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }</tt>.  The
-     * corresponding output \p thread_keys in those threads will be
-     * <tt>{ [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }</tt>.
-     *
-     */
-    __device__ __forceinline__ void SortDescendingBlockedToStriped(
-        Key     (&keys)[ITEMS_PER_THREAD],          ///< [in-out] Keys to sort
-        Value   (&values)[ITEMS_PER_THREAD],        ///< [in-out] Values to sort
-        int     begin_bit   = 0,                    ///< [in] <b>[optional]</b> The beginning (least-significant) bit index needed for key comparison
-        int     end_bit     = sizeof(Key) * 8)      ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-    {
-        SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type<true>(), Int2Type<KEYS_ONLY>());
-    }
-
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_block_radix_sort.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh b/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
deleted file mode 100644
index cf11f2d04..000000000
--- a/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh
+++ /dev/null
@@ -1,149 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data.
- */
-
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_arch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data.    ![](raking.png)
- * \ingroup BlockModule
- *
- * \par Overview
- * This type facilitates a shared memory usage pattern where a block of CUDA
- * threads places elements into shared memory and then reduces the active
- * parallelism to one "raking" warp of threads for serially aggregating consecutive
- * sequences of shared items.  Padding is inserted to eliminate bank conflicts
- * (for most data types).
- *
- * \tparam T                        The data type to be exchanged.
- * \tparam BLOCK_THREADS            The thread block size in threads.
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- */
-template <
-    typename    T,
-    int         BLOCK_THREADS,
-    int         PTX_ARCH = CUB_PTX_ARCH>
-struct BlockRakingLayout
-{
-    //---------------------------------------------------------------------
-    // Constants and type definitions
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// The total number of elements that need to be cooperatively reduced
-        SHARED_ELEMENTS = BLOCK_THREADS,
-
-        /// Maximum number of warp-synchronous raking threads
-        MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Number of raking elements per warp-synchronous raking thread (rounded up)
-        SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS,
-
-        /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads)
-        RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH,
-
-        /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1)
-        HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0),
-
-        /// Degree of bank conflicts (e.g., 4-way)
-        CONFLICT_DEGREE = (HAS_CONFLICTS) ?
-            (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) :
-            1,
-
-        /// Pad each segment length with one element if degree of bank conflicts is greater than 4-way (heuristic)
-        SEGMENT_PADDING = (CONFLICT_DEGREE > CUB_PREFER_CONFLICT_OVER_PADDING(PTX_ARCH)) ? 1 : 0,
-//        SEGMENT_PADDING = (HAS_CONFLICTS) ? 1 : 0,
-
-        /// Total number of elements in the raking grid
-        GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + SEGMENT_PADDING),
-
-        /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads)
-        UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0),
-    };
-
-
-    /**
-     * \brief Shared memory storage type
-     */
-    typedef T _TempStorage[BlockRakingLayout::GRID_ELEMENTS];
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /**
-     * \brief Returns the location for the calling thread to place data into the grid
-     */
-    static __device__ __forceinline__ T* PlacementPtr(
-        TempStorage &temp_storage,
-        int linear_tid)
-    {
-        // Offset for partial
-        unsigned int offset = linear_tid;
-
-        // Add in one padding element for every segment
-        if (SEGMENT_PADDING > 0)
-        {
-            offset += offset / SEGMENT_LENGTH;
-        }
-
-        // Incorporating a block of padding partials every shared memory segment
-        return temp_storage.Alias() + offset;
-    }
-
-
-    /**
-     * \brief Returns the location for the calling thread to begin sequential raking
-     */
-    static __device__ __forceinline__ T* RakingPtr(
-        TempStorage &temp_storage,
-        int linear_tid)
-    {
-        return temp_storage.Alias() + (linear_tid * (SEGMENT_LENGTH + SEGMENT_PADDING));
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_reduce.cuh b/thrust/system/cuda/detail/cub/block/block_reduce.cuh
deleted file mode 100644
index d77cd917d..000000000
--- a/thrust/system/cuda/detail/cub/block/block_reduce.cuh
+++ /dev/null
@@ -1,607 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_reduce_raking.cuh"
-#include "specializations/block_reduce_raking_commutative_only.cuh"
-#include "specializations/block_reduce_warp_reductions.cuh"
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA threadblock.
- */
-enum BlockReduceAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that only supports commutative
-     * reduction operators (true for most operations, e.g., addition).
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Threads in warps other than the first warp place
-     *    their partial reductions into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within the first
-     *    warp continue to accumulate by raking across segments of shared partial reductions
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
-     *   and is preferable when the reduction operator is commutative.  This variant
-     *   applies fewer reduction operators  than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY,
-
-
-    /**
-     * \par Overview
-     * An efficient "raking" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators. \blocked.
-     *
-     * \par
-     * Execution is comprised of three phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a
-     *    single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style reduction within the raking warp.
-     *
-     * \par
-     * \image html block_reduce.png
-     * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant performs more communication than BLOCK_REDUCE_RAKING
-     *   and is only preferable when the reduction operator is non-commutative.  This variant
-     *   applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall
-     *   throughput across the GPU when suitably occupied.  However, turn-around latency may be
-     *   higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable
-     *   when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_RAKING,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warp-reductions" reduction algorithm that supports commutative
-     * (e.g., addition) and non-commutative (e.g., string concatenation) reduction
-     * operators.
-     *
-     * \par
-     * Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more
-     *    than one input each).  Each thread then places the partial reduction
-     *    of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style
-     *    reduction within each warp.
-     * -# A propagation phase where the warp reduction outputs in each warp are
-     *    updated with the aggregate from each preceding warp.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
-     *   or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall
-     *   throughput across the GPU.  However turn-around latency may be lower and
-     *   thus useful when the GPU is under-occupied.
-     */
-    BLOCK_REDUCE_WARP_REDUCTIONS,
-};
-
-
-/******************************************************************************
- * Block reduce
- ******************************************************************************/
-
-/**
- * \brief The BlockReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being reduced
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - \rowmajor
- * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles:
- *   -# <b>cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY</b>.  An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_RAKING</b>.  An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *   -# <b>cub::BLOCK_REDUCE_WARP_REDUCTIONS</b>.  A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Very efficient (only one synchronization barrier).
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Summation (<b><em>vs.</em></b> generic reduction)
- *   - \p BLOCK_THREADS is a multiple of the architecture's warp size
- *   - Every thread has a valid input (i.e., full <b><em>vs.</em></b> partial-tiles)
- * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockReduce}
- * \par
- * The code snippet below illustrates a sum reduction of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockReduce for a 1D block of 128 threads on type int
- *     typedef cub::BlockReduce<int, 128> BlockReduce;
- *
- *     // Allocate shared memory for BlockReduce
- *     __shared__ typename BlockReduce::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Compute the block-wide sum for thread0
- *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- *
- */
-template <
-    typename                T,
-    int                     BLOCK_DIM_X,
-    BlockReduceAlgorithm    ALGORITHM       = BLOCK_REDUCE_WARP_REDUCTIONS,
-    int                     BLOCK_DIM_Y     = 1,
-    int                     BLOCK_DIM_Z     = 1,
-    int                     PTX_ARCH        = CUB_PTX_ARCH>
-class BlockReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    typedef BlockReduceWarpReductions<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>           WarpReductions;
-    typedef BlockReduceRakingCommutativeOnly<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>    RakingCommutativeOnly;
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH>                   Raking;
-
-    /// Internal specialization type
-    typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS),
-        WarpReductions,
-        typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY),
-            RakingCommutativeOnly,
-            Raking>::Type>::Type InternalBlockReduce;     // BlockReduceRaking
-
-    /// Shared memory storage layout type for BlockReduce
-    typedef typename InternalBlockReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-
-public:
-
-    /// \smemstorage{BlockReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                      ///< [in] Calling thread's input
-        ReductionOp     reduction_op)               ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        return InternalBlockReduce(temp_storage).template Reduce<true>(input, BLOCK_THREADS, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max());
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               (&inputs)[ITEMS_PER_THREAD],    ///< [in] Calling thread's input segment
-        ReductionOp     reduction_op)                   ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        // Reduce partials
-        T partial = ThreadReduce(inputs, reduction_op);
-        return Reduce(partial, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using the specified binary reduction functor.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid) thread_data = ...
-     *
-     *     // Compute the block-wide max for thread0
-     *     int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid);
-     *
-     * \endcode
-     *
-     * \tparam ReductionOp          <b>[inferred]</b> Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        ReductionOp         reduction_op,           ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        int                 num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<true>(input, num_valid, reduction_op);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Reduce<false>(input, num_valid, reduction_op);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item
-     *     int thread_data;
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input)                      ///< [in] Calling thread's input
-    {
-        return InternalBlockReduce(temp_storage).template Sum<true>(input, BLOCK_THREADS);
-    }
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data);
-     *
-     * \endcode
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ T Sum(
-        T   (&inputs)[ITEMS_PER_THREAD])    ///< [in] Calling thread's input segment
-    {
-        // Reduce partials
-        T partial = ThreadReduce(inputs, cub::Sum());
-        return Sum(partial);
-    }
-
-
-    /**
-     * \brief Computes a block-wide reduction for thread<sub>0</sub> using addition (+) as the reduction operator.  The first \p num_valid threads each contribute one input element.
-     *
-     * \par
-     * - The return value is undefined in threads other than thread<sub>0</sub>.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>
-     *
-     * __global__ void ExampleKernel(int num_valid, ...)
-     * {
-     *     // Specialize BlockReduce for a 1D block of 128 threads on type int
-     *     typedef cub::BlockReduce<int, 128> BlockReduce;
-     *
-     *     // Allocate shared memory for BlockReduce
-     *     __shared__ typename BlockReduce::TempStorage temp_storage;
-     *
-     *     // Each thread obtains an input item (up to num_items)
-     *     int thread_data;
-     *     if (threadIdx.x < num_valid)
-     *         thread_data = ...
-     *
-     *     // Compute the block-wide sum for thread0
-     *     int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid);
-     *
-     * \endcode
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T   input,                  ///< [in] Calling thread's input
-        int num_valid)              ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS)
-    {
-        // Determine if we scan skip bounds checking
-        if (num_valid >= BLOCK_THREADS)
-        {
-            return InternalBlockReduce(temp_storage).template Sum<true>(input, num_valid);
-        }
-        else
-        {
-            return InternalBlockReduce(temp_storage).template Sum<false>(input, num_valid);
-        }
-    }
-
-
-    //@}  end member group
-};
-
-/**
- * \example example_block_reduce.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_scan.cuh b/thrust/system/cuda/detail/cub/block/block_scan.cuh
deleted file mode 100644
index 2908a3299..000000000
--- a/thrust/system/cuda/detail/cub/block/block_scan.cuh
+++ /dev/null
@@ -1,2327 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "specializations/block_scan_raking.cuh"
-#include "specializations/block_scan_warp_scans.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_ptx.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/******************************************************************************
- * Scan utility types
- ******************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Reduce-value-by-ID scan operator
- */
-template <typename ReductionOp>     ///< Wrapped reduction operator type
-struct ReduceByKeyOp
-{
-    ReductionOp op;                 ///< Wrapped reduction operator
-
-    /// Constructor
-    __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePair>
-    __device__ __forceinline__ KeyValuePair operator()(
-        const KeyValuePair &first,
-        const KeyValuePair &second)
-    {
-        KeyValuePair retval;
-
-        retval.value = (second.key != first.key) ?
-                second.value :                      // The second value is for a different ID, return only that value
-                op(first.value, second.value);      // The values are for the same ID so reduce them
-
-        retval.key = second.key;
-        return retval;
-    }
-};
-
-
-
-/**
- * Segmented scan operator
- */
-template <typename ReductionOp>     ///< Wrapped reduction operator type
-struct SegmentedOp
-{
-    ReductionOp op;                 ///< Wrapped reduction operator
-
-    /// Constructor
-    __device__ __forceinline__ SegmentedOp(ReductionOp op) : op(op) {}
-
-    /// Scan operator
-    template <typename KeyValuePair>
-    __device__ __forceinline__ KeyValuePair operator()(
-        const KeyValuePair &first,
-        const KeyValuePair &second)
-    {
-        if (second.key) {
-            KeyValuePair retval;
-            retval.value = second.value;
-            retval.key = first.key + second.key;
-            return retval;
-        } else {
-            KeyValuePair retval;
-            retval.value = op(first.value, second.value);
-            retval.key = first.key + second.key;
-            return ;
-        }
-    }
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-/**
- * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block.
- */
-enum BlockScanAlgorithm
-{
-
-    /**
-     * \par Overview
-     * An efficient "raking reduce-then-scan" prefix scan algorithm.  Execution is comprised of five phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Upsweep sequential reduction in shared memory.  Threads within a single warp rake across segments of shared partial reductions.
-     * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp.
-     * -# Downsweep sequential exclusive scan in shared memory.  Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_raking.png
-     * <div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer longer turnaround latencies when the
-     *   GPU is under-occupied, it can often provide higher overall throughput
-     *   across the GPU when suitably occupied.
-     */
-    BLOCK_SCAN_RAKING,
-
-
-    /**
-     * \par Overview
-     * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at
-     * the expense of higher register pressure.  Raking threads preserve their
-     * "upsweep" segment of values in registers while performing warp-synchronous
-     * scan, allowing the "downsweep" not to re-read them from shared memory.
-     */
-    BLOCK_SCAN_RAKING_MEMOIZE,
-
-
-    /**
-     * \par Overview
-     * A quick "tiled warpscans" prefix scan algorithm.  Execution is comprised of four phases:
-     * -# Upsweep sequential reduction in registers (if threads contribute more than one input each).  Each thread then places the partial reduction of its item(s) into shared memory.
-     * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp.
-     * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp.
-     * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output.
-     *
-     * \par
-     * \image html block_scan_warpscans.png
-     * <div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
-     *
-     * \par Performance Considerations
-     * - Although this variant may suffer lower overall throughput across the
-     *   GPU because due to a heavy reliance on inefficient warpscans, it can
-     *   often provide lower turnaround latencies when the GPU is under-occupied.
-     */
-    BLOCK_SCAN_WARP_SCANS,
-};
-
-
-/******************************************************************************
- * Block scan
- ******************************************************************************/
-
-/**
- * \brief The BlockScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                Data type being scanned
- * \tparam BLOCK_DIM_X      The thread block length in threads along the X dimension
- * \tparam ALGORITHM        <b>[optional]</b> cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING)
- * \tparam BLOCK_DIM_Y      <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z      <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH         <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - \rowmajor
- * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles:
- *   -# <b>cub::BLOCK_SCAN_RAKING</b>.  An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_RAKING_MEMOIZE</b>.  Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm)
- *   -# <b>cub::BLOCK_SCAN_WARP_SCANS</b>.  A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm)
- *
- * \par Performance Considerations
- * - \granularity
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Invokes a minimal number of minimal block-wide synchronization barriers (only
- *   one or two depending on algorithm selection)
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *   - Prefix sum variants (<b><em>vs.</em></b> generic scan)
- *   - \blocksize
- * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives
- *
- * \par A Simple Example
- * \blockcollective{BlockScan}
- * \par
- * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
- * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
- * where each thread owns 4 consecutive items.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize BlockScan for a 1D block of 128 threads on type int
- *     typedef cub::BlockScan<int, 128> BlockScan;
- *
- *     // Allocate shared memory for BlockScan
- *     __shared__ typename BlockScan::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Collectively compute the block-wide exclusive prefix sum
- *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is
- * <tt>{[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}</tt>.
- * The corresponding output \p thread_data in those threads will be
- * <tt>{[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}</tt>.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    BlockScanAlgorithm  ALGORITHM       = BLOCK_SCAN_RAKING,
-    int                 BLOCK_DIM_Y     = 1,
-    int                 BLOCK_DIM_Z     = 1,
-    int                 PTX_ARCH        = CUB_PTX_ARCH>
-class BlockScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /**
-     * Ensure the template parameterization meets the requirements of the
-     * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy
-     * cannot be used with threadblock sizes not a multiple of the
-     * architectural warp size.
-     */
-    static const BlockScanAlgorithm SAFE_ALGORITHM =
-        ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ?
-            BLOCK_SCAN_RAKING :
-            ALGORITHM;
-
-    typedef BlockScanWarpScans<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> WarpScans;
-    typedef BlockScanRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, (SAFE_ALGORITHM == BLOCK_SCAN_RAKING_MEMOIZE), PTX_ARCH> Raking;
-
-    /// Define the delegate type for the desired algorithm
-    typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS),
-        WarpScans,
-        Raking>::Type InternalBlockScan;
-
-    /// Shared memory storage layout type for BlockScan
-    typedef typename InternalBlockScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-public:
-
-    /// \smemstorage{BlockScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        ExclusiveScan(input, output, ZeroInitialize<T>(), cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 1, ..., 127</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        ExclusiveScan(input, output, ZeroInitialize<T>(), cub::Sum(), block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage).ExclusiveSum(
-     *             thread_data, thread_data, block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, ..., 127</tt>.
-     * The output for the second segment will be <tt>128, 129, ..., 255</tt>.  Furthermore,
-     * the value \p 128 will be stored in \p block_aggregate for all threads after each scan.
-     *
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        ExclusiveScan(input, output, ZeroInitialize<T>(), cub::Sum(), block_aggregate, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD])  ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        // Reduce consecutive thread items in registers
-        Sum scan_op;
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveSum(thread_partial, thread_partial);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                 (&input)[ITEMS_PER_THREAD],       ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],      ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 &block_aggregate)                 ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        Sum scan_op;
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveSum(thread_partial, thread_partial, block_aggregate);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \identityzero
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *
-     *         // Collectively compute the block-wide exclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).ExclusiveSum(
-     *             thread_data, thread_data, block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 1, 2, 3, ..., 510, 511</tt>.
-     * The output for the second segment will be <tt>512, 513, 514, 515, ..., 1022, 1023</tt>.  Furthermore,
-     * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                       &block_aggregate,             ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        // Reduce consecutive thread items in registers
-        Sum scan_op;
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-
-    //@}  end member group        // Inclusive prefix sums
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               identity,                       ///< [in] Identity value
-        ScanOp          scan_op)                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        T               identity,          ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T               &block_aggregate)   ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         int block_aggregate;
-     *         BlockScan(temp_storage).ExclusiveScan(
-     *             thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, ..., 124, 126</tt>.
-     * The output for the second segment will be <tt>126, 128, 128, 130, ..., 252, 254</tt>.  Furthermore,
-     * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second
-     * scan, etc.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T                       identity,                       ///< [in] Identity value
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, block_aggregate, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group        // Inclusive prefix sums
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 identity,                    ///< [in] Identity value
-        ScanOp            scan_op)                      ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, identity, scan_op);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide exclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                 identity,                    ///< [in] Identity value
-        ScanOp            scan_op,                      ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                 &block_aggregate)             ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an exclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *
-     *         // Collectively compute the block-wide exclusive prefix max scan
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).ExclusiveScan(
-     *             thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510</tt>.
-     * The output for the second segment will be <tt>510, 512, 512, 514, 514, 516, ..., 1020, 1022</tt>.  Furthermore,
-     * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second
-     * scan, etc.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                       identity,                       ///< [in] Identity value
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, identity, scan_op, block_aggregate, block_prefix_callback_op);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    //@}  end member group
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-#if 0
-
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (identityless, single datum per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the first warp of threads in the block, however only the return value from
-     * <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-
-#endif // #if 0
-
-    /******************************************************************//**
-     * \name Exclusive prefix scan operations (identityless, multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                 (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                 (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp            scan_op)                      ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, scan_op);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-    }
-
-
-    /**
-     * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     * The functor will be invoked by the first warp of threads in the block, however only the return value from
-     * <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],   ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],  ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                      ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,             ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)    ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        // Reduce consecutive thread items in registers
-        T thread_partial = ThreadReduce(input, scan_op);
-
-        // Exclusive threadblock-scan
-        ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
-
-        // Exclusive scan in registers with prefix
-        ThreadScanExclusive(input, output, scan_op, thread_partial);
-    }
-
-
-    //@}  end member group
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output)                        ///< [out] Calling thread's output item (may be aliased to \p input)
-    {
-        InclusiveScan(input, output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>1, 1, ..., 1</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>1, 2, ..., 128</tt>.
-     * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads.
-     *
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InclusiveScan(input, output, cub::Sum(), block_aggregate);
-    }
-
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes one input element.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage).InclusiveSum(
-     *             thread_data, thread_data, block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, ..., 128</tt>.
-     * The output for the second segment will be <tt>129, 130, ..., 256</tt>.  Furthermore,
-     * the value \p 128 will be stored in \p block_aggregate for all threads after each scan.
-     *
-     * \tparam BlockPrefixCallbackOp          <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        InclusiveScan(input, output, cub::Sum(), block_aggregate, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sum operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD])    ///< [out] Calling thread's output items (may be aliased to \p input)
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0]);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveSum(thread_partial, thread_partial);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix sum of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix sum
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be
-     * <tt>{ [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }</tt>.
-     * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <int ITEMS_PER_THREAD>
-    __device__ __forceinline__ void InclusiveSum(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveSum(thread_partial, thread_partial, block_aggregate);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator.  Each thread contributes an array of consecutive input elements.  Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix sum over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 512 integer items that are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3)
-     * across 128 threads where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total += block_aggregate;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *
-     *         // Collectively compute the block-wide inclusive prefix sum
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).IncluisveSum(
-     *             thread_data, thread_data, block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>1, 1, 1, 1, 1, 1, 1, 1, ...</tt>.
-     * The corresponding output for the first segment will be <tt>1, 2, 3, 4, ..., 511, 512</tt>.
-     * The output for the second segment will be <tt>513, 514, 515, 516, ..., 1023, 1024</tt>.  Furthermore,
-     * the value \p 512 will be stored in \p block_aggregate for all threads after each scan.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int ITEMS_PER_THREAD,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveSum(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveSum(input[0], output[0], block_aggregate, block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            Sum scan_op;
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveSum(thread_partial, thread_partial, block_aggregate, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial);
-        }
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
-     * are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain input item for each thread
-     *     int thread_data;
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>0, -1, 2, -3, ..., 126, -127</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ScanOp   <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \rowmajor
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(INT_MIN);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data = d_data[block_offset];
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         int block_aggregate;
-     *         BlockScan(temp_storage).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         d_data[block_offset] = thread_data;
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, ..., 126, 126</tt>.
-     * The output for the second segment will be <tt>128, 128, 130, 130, ..., 254, 254</tt>.  Furthermore,
-     * \p block_aggregate will be assigned \p 126 in all threads after the first scan, assigned \p 254 after the second
-     * scan, etc.
-     *
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp        <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate, block_prefix_callback_op);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scan operations (multiple data per thread)
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.  The
-     * corresponding output \p thread_data in those threads will be <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveScan(thread_partial, thread_partial, scan_op);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that
-     * are partitioned in a [<em>blocked arrangement</em>](index.html#sec5sec3) across 128 threads
-     * where each thread owns 4 consecutive items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize BlockScan for a 1D block of 128 threads on type int
-     *     typedef cub::BlockScan<int, 128> BlockScan;
-     *
-     *     // Allocate shared memory for BlockScan
-     *     __shared__ typename BlockScan::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Collectively compute the block-wide inclusive prefix max scan
-     *     int block_aggregate;
-     *     BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is
-     * <tt>{ [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }</tt>.
-     * The corresponding output \p thread_data in those threads will be
-     * <tt>{ [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }</tt>.
-     * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads.
-     *
-     * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp               <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename         ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T               (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T               &block_aggregate)               ///< [out] block-wide aggregate reduction of input items
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_aggregate);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial, (linear_tid != 0));
-        }
-    }
-
-
-    /**
-     * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes an array of consecutive input elements.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-     *
-     * \par
-     * - The \p block_prefix_callback_op functor must implement a member function <tt>T operator()(T block_aggregate)</tt>.
-     *   The functor's input parameter \p block_aggregate is the same value also returned by the scan operation.
-     *   The functor will be invoked by the first warp of threads in the block, however only the return value from
-     *   <em>lane</em><sub>0</sub> is applied as the block-wide prefix.  Can be stateful.
-     * - Supports non-commutative scan operators.
-     * - \blocked
-     * - \granularity
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a single thread block that progressively
-     * computes an inclusive prefix max scan over multiple "tiles" of input using a
-     * prefix functor to maintain a running total between block-wide scans.  Each tile consists
-     * of 128 integer items that are partitioned across 128 threads.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_scan.cuh>
-     *
-     * // A stateful callback functor that maintains a running prefix to be applied
-     * // during consecutive scan operations.
-     * struct BlockPrefixCallbackOp
-     * {
-     *     // Running prefix
-     *     int running_total;
-     *
-     *     // Constructor
-     *     __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {}
-     *
-     *     // Callback operator to be entered by the first warp of threads in the block.
-     *     // Thread-0 is responsible for returning a value for seeding the block-wide scan.
-     *     __device__ int operator()(int block_aggregate)
-     *     {
-     *         int old_prefix = running_total;
-     *         running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix;
-     *         return old_prefix;
-     *     }
-     * };
-     *
-     * __global__ void ExampleKernel(int *d_data, int num_items, ...)
-     * {
-     *     // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
-     *     typedef cub::BlockLoad<int*, 128, 4, BLOCK_LOAD_TRANSPOSE>   BlockLoad;
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_TRANSPOSE> BlockStore;
-     *     typedef cub::BlockScan<int, 128>                             BlockScan;
-     *
-     *     // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
-     *     __shared__ union {
-     *         typename BlockLoad::TempStorage     load;
-     *         typename BlockScan::TempStorage     scan;
-     *         typename BlockStore::TempStorage    store;
-     *     } temp_storage;
-     *
-     *     // Initialize running total
-     *     BlockPrefixCallbackOp prefix_op(0);
-     *
-     *     // Have the block iterate over segments of items
-     *     for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
-     *     {
-     *         // Load a segment of consecutive items that are blocked across threads
-     *         int thread_data[4];
-     *         BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *
-     *         // Collectively compute the block-wide inclusive prefix max scan
-     *         int block_aggregate;
-     *         BlockScan(temp_storage.scan).InclusiveScan(
-     *             thread_data, thread_data, cub::Max(), block_aggregate, prefix_op);
-     *         __syncthreads();
-     *
-     *         // Store scanned items to output segment
-     *         BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
-     *         __syncthreads();
-     *     }
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>0, -1, 2, -3, 4, -5, ...</tt>.
-     * The corresponding output for the first segment will be <tt>0, 0, 2, 2, 4, 4, ..., 510, 510</tt>.
-     * The output for the second segment will be <tt>512, 512, 514, 514, 516, 516, ..., 1022, 1022</tt>.  Furthermore,
-     * \p block_aggregate will be assigned \p 510 in all threads after the first scan, assigned \p 1022 after the second
-     * scan, etc.
-     *
-     * \tparam ITEMS_PER_THREAD         <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
-     * \tparam ScanOp                   <b>[inferred]</b> Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.) type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam BlockPrefixCallbackOp    <b>[inferred]</b> Call-back functor type having member <tt>T operator()(T block_aggregate)</tt>
-     */
-    template <
-        int             ITEMS_PER_THREAD,
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       (&input)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input items
-        T                       (&output)[ITEMS_PER_THREAD],    ///< [out] Calling thread's output items (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        T                       &block_aggregate,               ///< [out] block-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a block-wide prefix to be applied to all inputs.
-    {
-        if (ITEMS_PER_THREAD == 1)
-        {
-            InclusiveScan(input[0], output[0], scan_op, block_aggregate, block_prefix_callback_op);
-        }
-        else
-        {
-            // Reduce consecutive thread items in registers
-            T thread_partial = ThreadReduce(input, scan_op);
-
-            // Exclusive threadblock-scan
-            ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate, block_prefix_callback_op);
-
-            // Inclusive scan in registers with prefix
-            ThreadScanInclusive(input, output, scan_op, thread_partial);
-        }
-    }
-
-    //@}  end member group
-
-
-};
-
-/**
- * \example example_block_scan.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_shift.cuh b/thrust/system/cuda/detail/cub/block/block_shift.cuh
deleted file mode 100644
index 3cd09222a..000000000
--- a/thrust/system/cuda/detail/cub/block/block_shift.cuh
+++ /dev/null
@@ -1,325 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockShift class provides [<em>collective</em>](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../util_arch.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief The BlockShift class provides [<em>collective</em>](index.html#sec0) methods for shifting data partitioned across a CUDA thread block. ![](transpose_logo.png)
- * \ingroup BlockModule
- *
- * \tparam T                    The data type to be exchanged.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * It is commonplace for blocks of threads to rearrange data items between
- * threads.  The BlockShift abstraction allows threads to efficiently shift items
- * either (a) up to their successor or (b) down to their predecessor.
- *
- */
-template <
-    typename            T,
-    int                 BLOCK_DIM_X,
-    int                 BLOCK_DIM_Y         = 1,
-    int                 BLOCK_DIM_Z         = 1,
-    int                 PTX_ARCH            = CUB_PTX_ARCH>
-class BlockShift
-{
-private:
-
-    /******************************************************************************
-     * Constants
-     ******************************************************************************/
-
-    enum
-    {
-        BLOCK_THREADS               = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        LOG_WARP_THREADS            = CUB_LOG_WARP_THREADS(PTX_ARCH),
-        WARP_THREADS                = 1 << LOG_WARP_THREADS,
-        WARPS                       = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Shared memory storage layout type
-    typedef typename If<(PTX_ARCH >= 300),
-        T[WARPS],                                   // Kepler+ only needs smem to share between warps
-        T[BLOCK_THREADS] >::Type _TempStorage;
-
-public:
-
-    /// \smemstorage{BlockShift}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-private:
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-    int lane_id;
-    int warp_id;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-public:
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockShift()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockShift(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Shift exchanges
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Each thread obtains the \p input provided by its predecessor.  The first thread receives \p block_prefix.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Up(
-        T input,            ///< [in] Input item
-        T &output,          ///< [out] Output item
-        T block_prefix)     ///< [in] Prefix item to be provided to <em>thread</em><sub>0</sub>
-    {
-#if CUB_PTX_ARCH >= 300
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage[warp_id] = input;
-
-        __syncthreads();
-
-        output = ShuffleUp(input, 1);
-        if (lane_id == 0)
-        {
-            output = (linear_tid == 0) ?
-                block_prefix :
-                temp_storage[warp_id - 1];
-        }
-#else
-        temp_storage[linear_tid] = input;
-
-        __syncthreads();
-
-        output = (linear_tid == 0) ?
-            block_prefix :
-            temp_storage[linear_tid - 1];
-#endif
-    }
-
-
-    /**
-     * \brief Each thread receives the \p input provided by its predecessor.  The first thread receives \p block_prefix.  All threads receive the \p input provided by <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Up(
-        T input,            ///< [in] Input item
-        T &output,          ///< [out] Output item
-        T block_prefix,     ///< [in] Prefix item to be provided to <em>thread</em><sub>0</sub>
-        T &block_suffix)    ///< [out] Suffix item shifted out by the <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub> to be provided to all threads
-    {
-#if CUB_PTX_ARCH >= 300
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage[warp_id] = input;
-
-        __syncthreads();
-
-        output = ShuffleUp(input, 1);
-        if (lane_id == 0)
-        {
-            output = (linear_tid == 0) ?
-                block_prefix :
-                temp_storage[warp_id - 1];
-        }
-        block_suffix = temp_storage[WARPS - 1];
-#else
-        temp_storage[linear_tid] = input;
-
-        __syncthreads();
-
-        output = (linear_tid == 0) ?
-            block_prefix :
-            temp_storage[linear_tid - 1];
-
-        block_suffix = temp_storage[BLOCK_THREADS - 1];
-#endif
-    }
-
-
-    /**
-     * \brief Each thread obtains the \p input provided by its successor.  The last thread receives \p block_suffix.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Down(
-        T input,            ///< [in] Input item
-        T &output,          ///< [out] Output item
-        T block_suffix)     ///< [in] Suffix item to be provided to <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>
-    {
-#if CUB_PTX_ARCH >= 300
-        if (lane_id == 0)
-            temp_storage[warp_id] = input;
-
-        __syncthreads();
-
-        output = ShuffleDown(input, 1);
-        if (lane_id == WARP_THREADS - 1)
-        {
-            output = (linear_tid == BLOCK_THREADS - 1) ?
-                block_suffix :
-                temp_storage[warp_id + 1];
-        }
-#else
-        temp_storage[linear_tid] = input;
-
-        __syncthreads();
-
-        output = (linear_tid == BLOCK_THREADS - 1) ?
-            block_suffix :
-            temp_storage[linear_tid + 1];
-#endif
-    }
-
-
-    /**
-     * \brief Each thread obtains the \p input provided by its successor.  The last thread receives \p block_suffix.  All threads receive the \p input provided by <em>thread</em><sub>0</sub>.
-     *
-     * \par
-     * - \smemreuse
-     */
-    __device__ __forceinline__ void Down(
-        T input,            ///< [in] Input item
-        T &output,          ///< [out] Output item
-        T block_suffix,     ///< [in] Suffix item to be provided to <em>thread</em><sub><tt>BLOCK_THREADS-1</tt></sub>
-        T &block_prefix)    ///< [out] Prefix item shifted out by the <em>thread</em><sub>0</sub> to be provided to all threads
-    {
-#if CUB_PTX_ARCH >= 300
-        if (lane_id == 0)
-            temp_storage[warp_id] = input;
-
-        __syncthreads();
-
-        output = ShuffleDown(input, 1);
-        if (lane_id == WARP_THREADS - 1)
-        {
-            output = (linear_tid == BLOCK_THREADS - 1) ?
-                block_suffix :
-                temp_storage[warp_id + 1];
-        }
-#else
-        temp_storage[linear_tid] = input;
-
-        __syncthreads();
-
-        output = (linear_tid == BLOCK_THREADS - 1) ?
-            block_suffix :
-            temp_storage[linear_tid + 1];
-#endif
-
-        block_prefix = temp_storage[0];
-    }
-
-    //@}  end member group
-
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/block_store.cuh b/thrust/system/cuda/detail/cub/block/block_store.cuh
deleted file mode 100644
index 066541ada..000000000
--- a/thrust/system/cuda/detail/cub/block/block_store.cuh
+++ /dev/null
@@ -1,892 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Operations for writing linear segments of data from the CUDA thread block
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_exchange.cuh"
-#include "../util_ptx.cuh"
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-/******************************************************************//**
- * \name Blocked arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range
- *
- * \blocked
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectBlocked(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    // Store directly in thread-blocked order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items)
-        {
-            block_itr[(linear_tid * ITEMS_PER_THREAD) + ITEM] = items[ITEM];
-        }
-    }
-}
-
-
-/**
- * \brief Store a blocked arrangement of items across a thread block into a linear segment of items.
- *
- * \blocked
- *
- * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned,
- * which is the default starting offset returned by \p cudaMalloc()
- *
- * \par
- * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT:
- *   - \p ITEMS_PER_THREAD is odd
- *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- *
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD>
-__device__ __forceinline__ void StoreDirectBlockedVectorized(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    T                   *block_ptr,                 ///< [in] Input pointer for storing from
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    enum
-    {
-        // Maximum CUDA vector size is 4 elements
-        MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD),
-
-        // Vector size must be a power of two and an even divisor of the items per thread
-        VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ?
-            MAX_VEC_SIZE :
-            1,
-
-        VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE,
-    };
-
-    // Vector type
-    typedef typename CubVector<T, VEC_SIZE>::Type Vector;
-
-    // Alias global pointer
-    Vector *block_ptr_vectors = reinterpret_cast<Vector *>(block_ptr);
-
-    // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling)
-    Vector raw_vector[VECTORS_PER_THREAD];
-    T *raw_items = reinterpret_cast<T*>(raw_vector);
-
-    // Copy
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        raw_items[ITEM] = items[ITEM];
-    }
-
-    // Direct-store using vector types
-    StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector);
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items.
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-{
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \striped
- *
- * \tparam BLOCK_THREADS        The thread block size in threads
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    int                 BLOCK_THREADS,
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    // Store directly in striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items)
-        {
-            block_itr[(ITEM * BLOCK_THREADS) + linear_tid] = items[ITEM];
-        }
-    }
-}
-
-
-
-//@}  end member group
-/******************************************************************//**
- * \name Warp-striped arrangement I/O (direct)
- *********************************************************************/
-//@{
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items.
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-    }
-}
-
-
-/**
- * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range
- *
- * \warpstriped
- *
- * \par Usage Considerations
- * The number of threads in the thread block must be a multiple of the architecture's warp size.
- *
- * \tparam T                    <b>[inferred]</b> The data type to store.
- * \tparam ITEMS_PER_THREAD     <b>[inferred]</b> The number of consecutive items partitioned onto each thread.
- * \tparam OutputIterator       <b>[inferred]</b> The random-access iterator type for output \iterator.
- */
-template <
-    typename            T,
-    int                 ITEMS_PER_THREAD,
-    typename            OutputIterator>
-__device__ __forceinline__ void StoreDirectWarpStriped(
-    int                 linear_tid,                 ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., <tt>(threadIdx.y * blockDim.x) + linear_tid</tt> for 2D thread blocks)
-    OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-    T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-    int                 valid_items)                ///< [in] Number of valid items to write
-{
-    int tid         = linear_tid & (CUB_PTX_WARP_THREADS - 1);
-    int wid         = linear_tid >> CUB_PTX_LOG_WARP_THREADS;
-    int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD;
-
-    // Store directly in warp-striped order
-    #pragma unroll
-    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-    {
-        if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items)
-        {
-            block_itr[warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM];
-        }
-    }
-}
-
-
-//@}  end member group
-
-
-/** @} */       // end group UtilIo
-
-
-//-----------------------------------------------------------------------------
-// Generic BlockStore abstraction
-//-----------------------------------------------------------------------------
-
-/**
- * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory.
- */
-enum BlockStoreAlgorithm
-{
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
-     * directly to memory.  The thread block writes items in a parallel "raking" fashion:
-     * thread<sub><em>i</em></sub> writes the <em>i</em><sup>th</sup> segment of consecutive elements.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) decreases as the
-     *   access stride between threads increases (i.e., the number items per thread).
-     */
-    BLOCK_STORE_DIRECT,
-
-    /**
-     * \par Overview
-     *
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written directly
-     * to memory using CUDA's built-in vectorized stores as a coalescing optimization.
-     * The thread block writes items in a parallel "raking" fashion: thread<sub><em>i</em></sub> uses vector stores to
-     * write the <em>i</em><sup>th</sup> segment of consecutive elements.
-     *
-     * For example, <tt>st.global.v4.s32</tt> instructions will be generated when \p T = \p int and \p ITEMS_PER_THREAD > 4.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high until the the
-     *   access stride between threads (i.e., the number items per thread) exceeds the
-     *   maximum vector store width (typically 4 items or 64B, whichever is lower).
-     * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT:
-     *   - \p ITEMS_PER_THREAD is odd
-     *   - The \p OutputIterator is not a simple pointer type
-     *   - The block output offset is not quadword-aligned
-     *   - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.)
-     */
-    BLOCK_STORE_VECTORIZE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed into a [<em>striped arrangement</em>](index.html#sec5sec3)
-     * which is then written to memory.  More specifically, cub::BlockExchange
-     * used to locally reorder the items into a
-     * [<em>striped arrangement</em>](index.html#sec5sec3), after which the
-     * thread block writes items in a parallel "strip-mining" fashion: consecutive
-     * items owned by thread<sub><em>i</em></sub> are written to memory with
-     * stride \p BLOCK_THREADS between them.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_TRANSPOSE,
-
-    /**
-     * \par Overview
-     * A [<em>blocked arrangement</em>](index.html#sec5sec3) is locally
-     * transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3)
-     * which is then written to memory.  More specifically, cub::BlockExchange used
-     * to locally reorder the items into a
-     * [<em>warp-striped arrangement</em>](index.html#sec5sec3), after which
-     * each warp writes its own contiguous segment in a parallel "strip-mining" fashion:
-     * consecutive items owned by lane<sub><em>i</em></sub> are written to memory
-     * with stride \p WARP_THREADS between them.
-     *
-     * \par Performance Considerations
-     * - The utilization of memory transactions (coalescing) remains high regardless
-     *   of items written per thread.
-     * - The local reordering incurs slightly longer latencies and throughput than the
-     *   direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives.
-     */
-    BLOCK_STORE_WARP_TRANSPOSE,
-};
-
-
-/**
- * \brief The BlockStore class provides [<em>collective</em>](index.html#sec0) data movement methods for writing a [<em>blocked arrangement</em>](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory.  ![](block_store_logo.png)
- * \ingroup BlockModule
- * \ingroup UtilIo
- *
- * \tparam OutputIterator       The input iterator type \iterator.
- * \tparam BLOCK_DIM_X          The thread block length in threads along the X dimension
- * \tparam ITEMS_PER_THREAD     The number of consecutive items partitioned onto each thread.
- * \tparam ALGORITHM            <b>[optional]</b> cub::BlockStoreAlgorithm tuning policy enumeration.  default: cub::BLOCK_STORE_DIRECT.
- * \tparam WARP_TIME_SLICING    <b>[optional]</b> Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false)
- * \tparam BLOCK_DIM_Y          <b>[optional]</b> The thread block length in threads along the Y dimension (default: 1)
- * \tparam BLOCK_DIM_Z          <b>[optional]</b> The thread block length in threads along the Z dimension (default: 1)
- * \tparam PTX_ARCH             <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - The BlockStore class provides a single data movement abstraction that can be specialized
- *   to implement different cub::BlockStoreAlgorithm strategies.  This facilitates different
- *   performance policies for different architectures, data types, granularity sizes, etc.
- * - BlockStore can be optionally specialized by different data movement strategies:
- *   -# <b>cub::BLOCK_STORE_DIRECT</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3) of data is written
- *      directly to memory. [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_VECTORIZE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      of data is written directly to memory using CUDA's built-in vectorized stores as a
- *      coalescing optimization.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- *   -# <b>cub::BLOCK_STORE_WARP_TRANSPOSE</b>.  A [<em>blocked arrangement</em>](index.html#sec5sec3)
- *      is locally transposed into a [<em>warp-striped arrangement</em>](index.html#sec5sec3) which is
- *      then written to memory.  [More...](\ref cub::BlockStoreAlgorithm)
- * - \rowmajor
- *
- * \par A Simple Example
- * \blockcollective{BlockStore}
- * \par
- * The code snippet below illustrates the storing of a "blocked" arrangement
- * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
- * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
- * meaning items are locally reordered among threads so that memory references will be
- * efficiently coalesced using a warp-striped access pattern.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
- *
- * __global__ void ExampleKernel(int *d_data, ...)
- * {
- *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
- *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
- *
- *     // Allocate shared memory for BlockStore
- *     __shared__ typename BlockStore::TempStorage temp_storage;
- *
- *     // Obtain a segment of consecutive items that are blocked across threads
- *     int thread_data[4];
- *     ...
- *
- *     // Store items to linear memory
- *     int thread_data[4];
- *     BlockStore(temp_storage).Store(d_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of \p thread_data across the block of threads is
- * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
- * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
- *
- */
-template <
-    typename                OutputIterator,
-    int                     BLOCK_DIM_X,
-    int                     ITEMS_PER_THREAD,
-    BlockStoreAlgorithm     ALGORITHM           = BLOCK_STORE_DIRECT,
-    bool                    WARP_TIME_SLICING   = false,
-    int                     BLOCK_DIM_Y         = 1,
-    int                     BLOCK_DIM_Z         = 1,
-    int                     PTX_ARCH            = CUB_PTX_ARCH>
-class BlockStore
-{
-private:
-    /******************************************************************************
-     * Constants and typed definitions
-     ******************************************************************************/
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<OutputIterator>::value_type T;
-
-
-    /******************************************************************************
-     * Algorithmic variants
-     ******************************************************************************/
-
-    /// Store helper
-    template <BlockStoreAlgorithm _POLICY, int DUMMY>
-    struct StoreInternal;
-
-
-    /**
-     * BLOCK_STORE_DIRECT specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_DIRECT, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        __device__ __forceinline__ void Store(
-            OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        __device__ __forceinline__ void Store(
-            OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_VECTORIZE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_VECTORIZE, DUMMY>
-    {
-        /// Shared memory storage layout type
-        typedef NullType TempStorage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization)
-        __device__ __forceinline__ void Store(
-            T                   *block_ptr,                 ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlockedVectorized(linear_tid, block_ptr, items);
-        }
-
-        /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization)
-        template <typename _OutputIterator>
-        __device__ __forceinline__ void Store(
-            _OutputIterator     block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        __device__ __forceinline__ void Store(
-            OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            StoreDirectBlocked(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_TRANSPOSE, DUMMY>
-    {
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        __device__ __forceinline__ void Store(
-            OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        __device__ __forceinline__ void Store(
-            OutputIterator    block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToStriped(items);
-            StoreDirectStriped<BLOCK_THREADS>(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-
-    /**
-     * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper
-     */
-    template <int DUMMY>
-    struct StoreInternal<BLOCK_STORE_WARP_TRANSPOSE, DUMMY>
-    {
-        enum
-        {
-            WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH)
-        };
-
-        // Assert BLOCK_THREADS must be a multiple of WARP_THREADS
-        CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS");
-
-        // BlockExchange utility type for keys
-        typedef BlockExchange<T, BLOCK_DIM_X, ITEMS_PER_THREAD, WARP_TIME_SLICING, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> BlockExchange;
-
-        /// Shared memory storage layout type
-        typedef typename BlockExchange::TempStorage _TempStorage;
-
-        /// Alias wrapper allowing storage to be unioned
-        struct TempStorage : Uninitialized<_TempStorage> {};
-
-        /// Thread reference to shared storage
-        _TempStorage &temp_storage;
-
-        /// Linear thread-id
-        int linear_tid;
-
-        /// Constructor
-        __device__ __forceinline__ StoreInternal(
-            TempStorage &temp_storage,
-            int linear_tid)
-        :
-            temp_storage(temp_storage.Alias()),
-            linear_tid(linear_tid)
-        {}
-
-        /// Store items into a linear segment of memory
-        __device__ __forceinline__ void Store(
-            OutputIterator    block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items);
-        }
-
-        /// Store items into a linear segment of memory, guarded by range
-        __device__ __forceinline__ void Store(
-            OutputIterator    block_itr,                  ///< [in] The thread block's base output iterator for storing to
-            T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-            int                 valid_items)                ///< [in] Number of valid items to write
-        {
-            BlockExchange(temp_storage).BlockedToWarpStriped(items);
-            StoreDirectWarpStriped(linear_tid, block_itr, items, valid_items);
-        }
-    };
-
-    /******************************************************************************
-     * Type definitions
-     ******************************************************************************/
-
-    /// Internal load implementation to use
-    typedef StoreInternal<ALGORITHM, 0> InternalStore;
-
-
-    /// Shared memory storage layout type
-    typedef typename InternalStore::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Internal storage allocator
-    __device__ __forceinline__ _TempStorage& PrivateStorage()
-    {
-        __shared__ _TempStorage private_storage;
-        return private_storage;
-    }
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Thread reference to shared storage
-    _TempStorage &temp_storage;
-
-    /// Linear thread-id
-    int linear_tid;
-
-public:
-
-
-    /// \smemstorage{BlockStore}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using a private static allocation of shared memory as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore()
-    :
-        temp_storage(PrivateStorage()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.
-     */
-    __device__ __forceinline__ BlockStore(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data movement
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Store items into a linear segment of memory.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt>.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, 5, ...</tt>.
-     *
-     */
-    __device__ __forceinline__ void Store(
-        OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items);
-    }
-
-    /**
-     * \brief Store items into a linear segment of memory, guarded by range.
-     *
-     * \par
-     * - \blocked
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the guarded storing of a "blocked" arrangement
-     * of 512 integers across 128 threads (where each thread owns 4 consecutive items)
-     * into a linear segment of memory.  The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE,
-     * meaning items are locally reordered among threads so that memory references will be
-     * efficiently coalesced using a warp-striped access pattern.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/block/block_store.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items, ...)
-     * {
-     *     // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each
-     *     typedef cub::BlockStore<int*, 128, 4, BLOCK_STORE_WARP_TRANSPOSE> BlockStore;
-     *
-     *     // Allocate shared memory for BlockStore
-     *     __shared__ typename BlockStore::TempStorage temp_storage;
-     *
-     *     // Obtain a segment of consecutive items that are blocked across threads
-     *     int thread_data[4];
-     *     ...
-     *
-     *     // Store items to linear memory
-     *     int thread_data[4];
-     *     BlockStore(temp_storage).Store(d_data, thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of \p thread_data across the block of threads is
-     * <tt>{ [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }</tt> and \p valid_items is \p 5.
-     * The output \p d_data will be <tt>0, 1, 2, 3, 4, ?, ?, ?, ...</tt>, with
-     * only the first two threads being unmasked to store portions of valid data.
-     *
-     */
-    __device__ __forceinline__ void Store(
-        OutputIterator      block_itr,                  ///< [in] The thread block's base output iterator for storing to
-        T                   (&items)[ITEMS_PER_THREAD], ///< [in] Data to store
-        int                 valid_items)                ///< [in] Number of valid items to write
-    {
-        InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items);
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
deleted file mode 100644
index ec4159ee2..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh
+++ /dev/null
@@ -1,82 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <int BINS>
-struct BlockHistogramAtomic
-{
-    /// Shared memory storage layout type
-    struct TempStorage {};
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramAtomic(
-        TempStorage &temp_storage)
-    {}
-
-
-    /// Composite data onto an existing histogram
-    template <
-        typename            T,
-        typename            HistoCounter,
-        int                 ITEMS_PER_THREAD>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                 ///< [out] Reference to shared/global memory histogram
-    {
-        // Update histogram
-        #pragma unroll
-        for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-        {
-              atomicAdd(histogram + items[i], 1);
-        }
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
deleted file mode 100644
index 12766ae56..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh
+++ /dev/null
@@ -1,226 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-
-#pragma once
-
-#include "../../block/block_radix_sort.cuh"
-#include "../../block/block_discontinuity.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block.
- */
-template <
-    typename    T,                  ///< Sample type
-    int         BLOCK_DIM_X,        ///< The thread block length in threads along the X dimension
-    int         ITEMS_PER_THREAD,   ///< The number of samples per thread
-    int         BINS,               ///< The number of bins into which histogram samples may fall
-    int         BLOCK_DIM_Y,        ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,        ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>           ///< The PTX compute capability for which to to specialize this collective
-struct BlockHistogramSort
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // Parameterize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<
-            T,
-            BLOCK_DIM_X,
-            ITEMS_PER_THREAD,
-            NullType,
-            4,
-            (PTX_ARCH >= 350) ? true : false,
-            BLOCK_SCAN_WARP_SCANS,
-            (PTX_ARCH >= 350) ? cudaSharedMemBankSizeEightByte : cudaSharedMemBankSizeFourByte,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockRadixSortT;
-
-    // Parameterize BlockDiscontinuity type for our thread block
-    typedef BlockDiscontinuity<
-            T,
-            BLOCK_DIM_X,
-            BLOCK_DIM_Y,
-            BLOCK_DIM_Z,
-            PTX_ARCH>
-        BlockDiscontinuityT;
-
-    /// Shared memory
-    union _TempStorage
-    {
-        // Storage for sorting bin values
-        typename BlockRadixSortT::TempStorage sort;
-
-        struct
-        {
-            // Storage for detecting discontinuities in the tile of sorted bin values
-            typename BlockDiscontinuityT::TempStorage flag;
-
-            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
-            unsigned int run_begin[BINS];
-            unsigned int run_end[BINS];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockHistogramSort(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    // Discontinuity functor
-    struct DiscontinuityOp
-    {
-        // Reference to temp_storage
-        _TempStorage &temp_storage;
-
-        // Constructor
-        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
-            temp_storage(temp_storage)
-        {}
-
-        // Discontinuity predicate
-        __device__ __forceinline__ bool operator()(const T &a, const T &b, unsigned int b_index)
-        {
-            if (a != b)
-            {
-                // Note the begin/end offsets in shared storage
-                temp_storage.run_begin[b] = b_index;
-                temp_storage.run_end[a] = b_index;
-
-                return true;
-            }
-            else
-            {
-                return false;
-            }
-        }
-    };
-
-
-    // Composite data onto an existing histogram
-    template <
-        typename            HistoCounter>
-    __device__ __forceinline__ void Composite(
-        T                   (&items)[ITEMS_PER_THREAD],     ///< [in] Calling thread's input values to histogram
-        HistoCounter        histogram[BINS])                 ///< [out] Reference to shared/global memory histogram
-    {
-        enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD };
-
-        // Sort bytes in blocked arrangement
-        BlockRadixSortT(temp_storage.sort).Sort(items);
-
-        __syncthreads();
-
-        // Initialize the shared memory's run_begin and run_end for each bin
-        int histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-        // Finish up with guarded initialization if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE;
-            temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE;
-        }
-
-        __syncthreads();
-
-        int flags[ITEMS_PER_THREAD];    // unused
-
-        // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile
-        DiscontinuityOp flag_op(temp_storage);
-        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
-
-        // Update begin for first item
-        if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0;
-
-        __syncthreads();
-
-        // Composite into histogram
-        histo_offset = 0;
-
-        #pragma unroll
-        for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-        {
-            int thread_offset = histo_offset + linear_tid;
-            HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-
-        // Finish up with guarded composition if necessary
-        if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS))
-        {
-            int thread_offset = histo_offset + linear_tid;
-            HistoCounter count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset];
-            histogram[thread_offset] += count;
-        }
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
deleted file mode 100644
index 3bddce65d..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh
+++ /dev/null
@@ -1,247 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../block/block_raking_layout.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block.  Supports non-commutative reduction operators.
- *
- * Supports non-commutative binary reduction operators.  Unlike commutative
- * reduction operators (e.g., addition), the application of a non-commutative
- * reduction operator (e.g, string concatenation) across a sequence of inputs must
- * honor the relative ordering of items and partial reductions when applying the
- * reduction operator.
- *
- * Compared to the implementation of BlockReduceRaking (which does not support
- * non-commutative operators), this implementation requires a few extra
- * rounds of inter-thread communication.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRaking
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, BlockRakingLayout::RAKING_THREADS, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS),
-
-        /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two
-        WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo<RAKING_THREADS>::VALUE,
-
-        /// Whether or not accesses into smem are unguarded
-        RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED,
-
-    };
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpReduce::TempStorage            warp_storage;        ///< Storage for warp-synchronous reduction
-        typename BlockRakingLayout::TempStorage     raking_grid;         ///< Padded threadblock raking grid
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    template <bool FULL_TILE, typename ReductionOp, int ITERATION>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           *raking_segment,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<ITERATION>         iteration)
-    {
-        // Update partial if addend is in range
-        if ((FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid))
-        {
-            T addend = raking_segment[ITERATION];
-            partial = reduction_op(partial, addend);
-        }
-        return RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<ITERATION + 1>());
-    }
-
-    template <bool FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T RakingReduction(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           *raking_segment,
-        T                           partial,            ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SEGMENT_LENGTH>    iteration)
-    {
-        return partial;
-    }
-
-
-    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum reduction_op;
-
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
-            partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE, SEGMENT_LENGTH>(
-                partial,
-                num_valid);
-        }
-        else
-        {
-            // Place partial into shared memory grid.
-            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
-
-            __syncthreads();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = raking_segment[0];
-
-                partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
-
-                partial = WarpReduce(temp_storage.warp_storage).template Sum<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
-                    partial,
-                    num_valid);
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two)
-            partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE, SEGMENT_LENGTH>(
-                partial,
-                num_valid,
-                reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid.
-            *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial;
-
-            __syncthreads();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = raking_segment[0];
-
-                partial = RakingReduction<FULL_TILE>(reduction_op, raking_segment, partial, num_valid, Int2Type<1>());
-
-                partial = WarpReduce(temp_storage.warp_storage).template Reduce<FULL_TILE && RAKING_UNGUARDED, SEGMENT_LENGTH>(
-                    partial,
-                    num_valid,
-                    reduction_op);
-            }
-        }
-
-        return partial;
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
deleted file mode 100644
index d0d736782..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh
+++ /dev/null
@@ -1,202 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "block_reduce_raking.cuh"
-#include "../../warp/warp_reduce.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block.  Does not support non-commutative reduction operators.  Does not support block sizes that are not a multiple of the warp size.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceRakingCommutativeOnly
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values
-    typedef BlockReduceRaking<T, BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z, PTX_ARCH> FallBack;
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Whether or not to use fall-back
-        USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)),
-
-        /// Number of raking threads
-        RAKING_THREADS = WARP_THREADS,
-
-        /// Number of threads actually sharing items with the raking threads
-        SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS),
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS,
-    };
-
-    ///  WarpReduce utility type
-    typedef WarpReduce<T, RAKING_THREADS, PTX_ARCH> WarpReduce;
-
-    /// Layout type for padded thread block raking grid
-    typedef BlockRakingLayout<T, SHARING_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        union
-        {
-            struct
-            {
-                typename WarpReduce::TempStorage        warp_storage;        ///< Storage for warp-synchronous reduction
-                typename BlockRakingLayout::TempStorage raking_grid;         ///< Padded threadblock raking grid
-            };
-            typename FallBack::TempStorage              fallback_storage;    ///< Fall-back storage for non-commutative block scan
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    int linear_tid;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceRakingCommutativeOnly(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Sum<FULL_TILE>(partial, num_valid);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            __syncthreads();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, cub::Sum(), partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Sum(partial);
-            }
-        }
-
-        return partial;
-    }
-
-
-    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   partial,            ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        if (USE_FALLBACK || !FULL_TILE)
-        {
-            return FallBack(temp_storage.fallback_storage).template Reduce<FULL_TILE>(partial, num_valid, reduction_op);
-        }
-        else
-        {
-            // Place partial into shared memory grid
-            if (linear_tid >= RAKING_THREADS)
-                *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial;
-
-            __syncthreads();
-
-            // Reduce parallelism to one warp
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking reduction in grid
-                T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-                partial = ThreadReduce<SEGMENT_LENGTH>(raking_segment, reduction_op, partial);
-
-                // Warpscan
-                partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op);
-            }
-        }
-
-        return partial;
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
deleted file mode 100644
index fb7ff6509..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh
+++ /dev/null
@@ -1,221 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock.  Supports non-commutative reduction operators.
- */
-
-#pragma once
-
-#include "../../warp/warp_reduce.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA threadblock.  Supports non-commutative reduction operators.
- */
-template <
-    typename    T,              ///< Data type being reduced
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockReduceWarpReductions
-{
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// The logical warp size for warp reductions
-        LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS),
-
-        /// Whether or not the logical warp size evenly divides the threadblock size
-        EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0)
-    };
-
-
-    ///  WarpReduce utility type
-    typedef typename WarpReduce<T, LOGICAL_WARP_SIZE, PTX_ARCH>::InternalWarpReduce WarpReduce;
-
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpReduce::TempStorage    warp_reduce[WARPS];                ///< Buffer for warp-synchronous scan
-        T                                   warp_aggregates[WARPS];     ///< Shared totals from each warp-synchronous scan
-        T                                   block_prefix;               ///< Shared prefix for the entire threadblock
-    };
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    int linear_tid;
-    int warp_id;
-    int lane_id;
-
-
-    /// Constructor
-    __device__ __forceinline__ BlockReduceWarpReductions(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    template <bool FULL_TILE, typename ReductionOp, int SUCCESSOR_WARP>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp                 reduction_op,       ///< [in] Binary scan operator
-        T                           warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                         num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<SUCCESSOR_WARP>    successor_warp)
-    {
-        if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid))
-        {
-            T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP];
-            warp_aggregate = reduction_op(warp_aggregate, addend);
-        }
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<SUCCESSOR_WARP + 1>());
-    }
-
-    template <bool FULL_TILE, typename ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         reduction_op,       ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        Int2Type<WARPS>     successor_warp)
-    {
-        return warp_aggregate;
-    }
-
-
-    /// Returns block-wide aggregate in <em>thread</em><sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T ApplyWarpAggregates(
-        ReductionOp         reduction_op,       ///< [in] Binary scan operator
-        T                   warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>0</sub> only]</b> Warp-wide aggregate reduction of input items
-        int                 num_valid)          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        // Share lane aggregates
-        if (lane_id == 0)
-        {
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-        }
-
-        __syncthreads();
-
-        // Update total aggregate in warp 0, lane 0
-        if (linear_tid == 0)
-        {
-            warp_aggregate = ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid, Int2Type<1>());
-        }
-
-        return warp_aggregate;
-    }
-
-
-    /// Computes a threadblock-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <bool FULL_TILE>
-    __device__ __forceinline__ T Sum(
-        T                   input,          ///< [in] Calling thread's input partial reductions
-        int                 num_valid)      ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-    {
-        cub::Sum        reduction_op;
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < num_valid) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Sum<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid);
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-
-    /// Computes a threadblock-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial.  The return value is only valid for thread<sub>0</sub>.
-    template <
-        bool                FULL_TILE,
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input partial reductions
-        int                 num_valid,          ///< [in] Number of valid elements (may be less than BLOCK_THREADS)
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        unsigned int    warp_offset = warp_id * LOGICAL_WARP_SIZE;
-        unsigned int    warp_num_valid = (FULL_TILE && EVEN_WARP_MULTIPLE) ?
-                            LOGICAL_WARP_SIZE :
-                            (warp_offset < num_valid) ?
-                                num_valid - warp_offset :
-                                0;
-
-        // Warp reduction in every warp
-        T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE), 1>(
-            input,
-            warp_num_valid,
-            reduction_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane-0s
-        return ApplyWarpAggregates<FULL_TILE>(reduction_op, warp_aggregate, num_valid);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
deleted file mode 100644
index 699457422..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh
+++ /dev/null
@@ -1,756 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-
-/**
- * \file
- * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
- */
-
-#pragma once
-
-#include "../../util_ptx.cuh"
-#include "../../util_arch.cuh"
-#include "../../block/block_raking_layout.cuh"
-#include "../../thread/thread_reduce.cuh"
-#include "../../thread/thread_scan.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA threadblock.
- */
-template <
-    typename    T,              ///< Data type being scanned
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    bool        MEMOIZE,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanRaking
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-    };
-
-    /// Layout type for padded threadblock raking grid
-    typedef BlockRakingLayout<T, BLOCK_THREADS, PTX_ARCH> BlockRakingLayout;
-
-    /// Constants
-    enum
-    {
-        /// Number of raking threads
-        RAKING_THREADS = BlockRakingLayout::RAKING_THREADS,
-
-        /// Number of raking elements per warp synchronous raking thread
-        SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH,
-
-        /// Cooperative work can be entirely warp synchronous
-        WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS),
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, RAKING_THREADS, PTX_ARCH> WarpScan;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpScan::TempStorage              warp_scan;          ///< Buffer for warp-synchronous scan
-        typename BlockRakingLayout::TempStorage     raking_grid;        ///< Padded threadblock raking grid
-        T                                           block_aggregate;    ///< Block aggregate
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage    &temp_storage;
-    int             linear_tid;
-    T               cached_segment[SEGMENT_LENGTH];
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /// Templated reduction
-    template <int ITERATION, typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                  raking_ptr,         ///< [in] Input array
-        ScanOp              scan_op,            ///< [in] Binary reduction operator
-        T                   raking_partial,     ///< [in] Prefix to seed reduction with
-        Int2Type<ITERATION> iteration)
-    {
-        if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS))
-        {
-            T addend = raking_ptr[ITERATION];
-            raking_partial = scan_op(raking_partial, addend);
-        }
-
-        return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type<ITERATION + 1>());
-    }
-
-
-    /// Templated reduction (base case)
-    template <typename ScanOp>
-    __device__ __forceinline__ T GuardedReduce(
-        T*                          raking_ptr,        ///< [in] Input array
-        ScanOp                      scan_op,           ///< [in] Binary reduction operator
-        T                           raking_partial,    ///< [in] Prefix to seed reduction with
-        Int2Type<SEGMENT_LENGTH>    iteration)
-    {
-        return raking_partial;
-    }
-
-
-    /// Templated copy
-    template <int ITERATION>
-    __device__ __forceinline__ void CopySegment(
-        T*                  out,            ///< [out] Out array
-        T*                  in,             ///< [in] Input array
-        Int2Type<ITERATION> iteration)
-    {
-        out[ITERATION] = in[ITERATION];
-        CopySegment(out, in, Int2Type<ITERATION + 1>());
-    }
-
- 
-    /// Templated copy (base case)
-    __device__ __forceinline__ void CopySegment(
-        T*                  out,            ///< [out] Out array
-        T*                  in,             ///< [in] Input array
-        Int2Type<SEGMENT_LENGTH> iteration)
-    {}
-
-
-    /// Performs upsweep raking reduction, returning the aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ T Upsweep(
-        ScanOp scan_op)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data into registers
-        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-
-        T raking_partial = cached_segment[0];
-
-        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
-    }
-
-
-    /// Performs exclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        ThreadScanExclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    /// Performs inclusive downsweep raking scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveDownsweep(
-        ScanOp          scan_op,
-        T               raking_partial,
-        bool            apply_prefix = true)
-    {
-        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
-
-        // Read data back into registers
-        if (!MEMOIZE)
-        {
-            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
-        }
-
-        ThreadScanInclusive(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
-
-        // Write data back to smem
-        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRaking(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &identity,          ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, identity, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, identity, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-            }
-
-            __syncthreads();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &identity,          ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, identity, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, identity, scan_op);
-
-                // Broadcast aggregate to other threads
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial);
-            }
-
-            __syncthreads();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.
-    template <
-        typename        ScanOp,
-        typename        BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T                       identity,                       ///< [in] Identity value
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T exclusive_partial;
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_partial, identity, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            output = block_prefix_callback_op(block_aggregate);
-            output = WarpScan(temp_storage.warp_scan).Broadcast(output, 0);
-
-            // Update prefix with exclusive warpscan partial
-            if (linear_tid > 0)
-                output = scan_op(output, exclusive_partial);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, identity, scan_op);
-
-                // Broadcast aggregate to other lanes (through smem because we eventually want it in all threads)
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    ThreadStore<STORE_VOLATILE>(&temp_storage.block_aggregate, inclusive_partial);
-                block_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage.block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T prefix = block_prefix_callback_op(block_aggregate);
-                prefix = WarpScan(temp_storage.warp_scan).Broadcast(prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                if (linear_tid > 0)
-                    prefix = scan_op(prefix, exclusive_partial);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, prefix);
-            }
-
-            __syncthreads();
-
-            // Grab exclusive partial from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Identity-less exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial= Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Broadcast aggregate to all threads
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T exclusive_partial;
-            WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_partial, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            output = block_prefix_callback_op(block_aggregate);
-            output = WarpScan(temp_storage.warp_scan).Broadcast(output, 0);
-
-            // Update prefix with exclusive warpscan partial
-            if (linear_tid > 0)
-                output = scan_op(output, exclusive_partial);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Broadcast aggregate to other lanes (through smem because we eventually want it in all threads)
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    ThreadStore<STORE_VOLATILE>(&temp_storage.block_aggregate, inclusive_partial);
-                block_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage.block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T prefix = block_prefix_callback_op(block_aggregate);
-                prefix = WarpScan(temp_storage.warp_scan).Broadcast(prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                if (linear_tid > 0)
-                    prefix = scan_op(prefix, exclusive_partial);
-
-                // Exclusive raking downsweep scan
-                ExclusiveDownsweep(scan_op, prefix);
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Exclusive Warp-synchronous scan
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-        }
-    }
-
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Broadcast aggregate to all threads
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    temp_storage.block_aggregate = inclusive_partial;
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0));
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        if (WARP_SYNCHRONOUS)
-        {
-            // Short-circuit directly to warp-synchronous scan
-            T inclusive_partial;
-            WarpScan(temp_storage.warp_scan).InclusiveScan(input, inclusive_partial, scan_op, block_aggregate);
-
-            // Obtain warp-wide prefix in lane0, then broadcast to other lanes
-            output = block_prefix_callback_op(block_aggregate);
-            output = WarpScan(temp_storage.warp_scan).Broadcast(output, 0);
-
-            // Update prefix with exclusive warpscan partial
-            output = scan_op(output, inclusive_partial);
-        }
-        else
-        {
-            // Place thread partial into shared memory raking grid
-            T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid);
-            *placement_ptr = input;
-
-            __syncthreads();
-
-            // Reduce parallelism down to just raking threads
-            if (linear_tid < RAKING_THREADS)
-            {
-                // Raking upsweep reduction across shared partials
-                T upsweep_partial = Upsweep(scan_op);
-
-                // Warp-synchronous scan
-                T inclusive_partial;
-                T exclusive_partial;
-                WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op);
-
-                // Broadcast aggregate to other lanes (through smem because we eventually want it in all threads)
-                if (threadIdx.x == RAKING_THREADS - 1)
-                    ThreadStore<STORE_VOLATILE>(&temp_storage.block_aggregate, inclusive_partial);
-                block_aggregate = ThreadLoad<LOAD_VOLATILE>(&temp_storage.block_aggregate);
-
-                // Obtain block-wide prefix in lane0, then broadcast to other lanes
-                T prefix = block_prefix_callback_op(block_aggregate);
-                prefix = WarpScan(temp_storage.warp_scan).Broadcast(prefix, 0);
-
-                // Update prefix with warpscan exclusive partial
-                if (linear_tid > 0)
-                    prefix = scan_op(prefix, exclusive_partial);
-
-                // Inclusive raking downsweep scan
-                InclusiveDownsweep(scan_op, prefix);
-            }
-
-            __syncthreads();
-
-            // Grab thread prefix from shared memory
-            output = *placement_ptr;
-
-            // Retrieve block aggregate
-            block_aggregate = temp_storage.block_aggregate;
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh b/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
deleted file mode 100644
index 706ee1e96..000000000
--- a/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh
+++ /dev/null
@@ -1,379 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
- */
-
-#pragma once
-
-#include "../../util_arch.cuh"
-#include "../../util_ptx.cuh"
-#include "../../warp/warp_scan.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA threadblock.
- */
-template <
-    typename    T,
-    int         BLOCK_DIM_X,    ///< The thread block length in threads along the X dimension
-    int         BLOCK_DIM_Y,    ///< The thread block length in threads along the Y dimension
-    int         BLOCK_DIM_Z,    ///< The thread block length in threads along the Z dimension
-    int         PTX_ARCH>       ///< The PTX compute capability for which to to specialize this collective
-struct BlockScanWarpScans
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    /// Constants
-    enum
-    {
-        /// Number of warp threads
-        WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// The thread block size in threads
-        BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z,
-
-        /// Number of active warps
-        WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-    };
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARP_THREADS, PTX_ARCH> WarpScanT;
-
-    ///  WarpScan utility type
-    typedef WarpScan<T, WARPS, PTX_ARCH> WarpAggregateScan;
-
-    /// Shared memory storage layout type
-    struct _TempStorage
-    {
-        typename WarpScanT::TempStorage warp_scan[WARPS];           ///< Buffer for warp-synchronous scans
-        T                               warp_aggregates[WARPS];
-        T                               block_prefix;               ///< Shared prefix for the entire threadblock
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Thread fields
-    _TempStorage &temp_storage;
-    int linear_tid;
-    int warp_id;
-    int lane_id;
-
-
-    //---------------------------------------------------------------------
-    // Constructors
-    //---------------------------------------------------------------------
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanWarpScans(
-        TempStorage &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)),
-        warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS),
-        lane_id(LaneId())
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    template <typename ScanOp, int WARP>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &partial,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        bool            lane_valid,         ///< [in] Whether or not the partial belonging to the current thread is valid
-        Int2Type<WARP>  addend_warp)
-    {
-        T inclusive = scan_op(block_aggregate, partial);
-        if (warp_id == WARP)
-        {
-            partial = (lane_valid) ?
-                inclusive :
-                block_aggregate;
-        }
-
-        T addend = temp_storage.warp_aggregates[WARP];
-        block_aggregate = scan_op(block_aggregate, addend);
-
-        ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<WARP + 1>());
-    }
-
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &partial,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        bool            lane_valid,         ///< [in] Whether or not the partial belonging to the current thread is valid
-        Int2Type<WARPS> addend_warp)
-    {}
-
-
-    /// Update the calling thread's partial reduction with the warp-wide aggregates from preceding warps.  Also returns block-wide aggregate in <em>thread</em><sub>0</sub>.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ApplyWarpAggregates(
-        T               &partial,           ///< [out] The calling thread's partial reduction
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               warp_aggregate,     ///< [in] <b>[<em>lane</em><sub>WARP_THREADS - 1</sub> only]</b> Warp-wide aggregate reduction of input items
-        T               &block_aggregate,   ///< [out] Threadblock-wide aggregate reduction of input items
-        bool            lane_valid = true)  ///< [in] Whether or not the partial belonging to the current thread is valid
-    {
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = warp_aggregate;
-
-        __syncthreads();
-
-        block_aggregate = temp_storage.warp_aggregates[0];
-
-        // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x)
-        ApplyWarpAggregates(partial, scan_op, block_aggregate, lane_valid, Int2Type<1>());
-    }
-
-    //---------------------------------------------------------------------
-    // Exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &identity,          ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, output, identity, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input items
-        T               &output,            ///< [out] Calling thread's output items (may be aliased to \p input)
-        const T         &identity,          ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &block_aggregate)   ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, identity, scan_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates
-        ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate);
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        T                       identity,                       ///< [in] Identity value
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        ExclusiveScan(input, output, identity, scan_op, block_aggregate);
-
-        // Use the first warp to determine the threadblock prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        __syncthreads();
-
-        // Incorporate threadblock prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        output = scan_op(block_prefix, output);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Identity-less exclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        ExclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.  With no identity value, the output computed for <em>thread</em><sub>0</sub> is undefined.
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        T inclusive_output;
-        WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, output, scan_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates
-        ApplyWarpAggregates(output, scan_op, inclusive_output, block_aggregate, (lane_id > 0));
-    }
-
-
-    /// Computes an exclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        ExclusiveScan(input, output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the threadblock prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        __syncthreads();
-
-        // Incorporate threadblock prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        output = (linear_tid == 0) ?
-            block_prefix :
-            scan_op(block_prefix, output);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive scans
-    //---------------------------------------------------------------------
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op)                        ///< [in] Binary scan operator
-    {
-        T block_aggregate;
-        InclusiveScan(input, output, scan_op, block_aggregate);
-    }
-
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,                          ///< [in] Calling thread's input item
-        T               &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp          scan_op,                        ///< [in] Binary scan operator
-        T               &block_aggregate)               ///< [out] Threadblock-wide aggregate reduction of input items
-    {
-        WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, output, scan_op);
-
-        // Update outputs and block_aggregate with warp-wide aggregates from lane WARP_THREADS-1
-        ApplyWarpAggregates(output, scan_op, output, block_aggregate);
-
-    }
-
-
-    /// Computes an inclusive threadblock-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by <em>lane</em><sub>0</sub> in that warp is used as the "seed" value that logically prefixes the threadblock's scan inputs.  Also provides every thread with the block-wide \p block_aggregate of all inputs.
-    template <
-        typename ScanOp,
-        typename BlockPrefixCallbackOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,                          ///< [in] Calling thread's input item
-        T                       &output,                        ///< [out] Calling thread's output item (may be aliased to \p input)
-        ScanOp                  scan_op,                        ///< [in] Binary scan operator
-        T                       &block_aggregate,               ///< [out] Threadblock-wide aggregate reduction of input items (exclusive of the \p block_prefix_callback_op value)
-        BlockPrefixCallbackOp   &block_prefix_callback_op)      ///< [in-out] <b>[<em>warp</em><sub>0</sub> only]</b> Call-back functor for specifying a threadblock-wide prefix to be applied to all inputs.
-    {
-        InclusiveScan(input, output, scan_op, block_aggregate);
-
-        // Use the first warp to determine the threadblock prefix, returning the result in lane0
-        if (warp_id == 0)
-        {
-            T block_prefix = block_prefix_callback_op(block_aggregate);
-            if (lane_id == 0)
-            {
-                // Share the prefix with all threads
-                temp_storage.block_prefix = block_prefix;
-            }
-        }
-
-        __syncthreads();
-
-        // Incorporate threadblock prefix into outputs
-        T block_prefix = temp_storage.block_prefix;
-        output = scan_op(block_prefix, output);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh
deleted file mode 100644
index 3ad884c1c..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_histo.cuh
+++ /dev/null
@@ -1,319 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "specializations/block_range_histo_gatomic.cuh"
-#include "specializations/block_range_histo_satomic.cuh"
-#include "specializations/block_range_histo_sort.cuh"
-#include "../util_type.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-
-/**
- * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockRangeHistogram.
- */
-enum DeviceHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
-     * -# A single thread block in the second kernel reduces them into the output histogram(s).
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using shared-memory \p atomicAdd().
-     * -# A single thread block in the second kernel reduces them into the
-     *    output histogram(s).
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SHARED_ATOMIC,
-
-
-    /**
-     * \par Overview
-     * A single-kernel approach in which thread blocks update the output histogram(s) directly
-     * using global-memory \p atomicAdd().
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * Performance is not significantly impacted when computing histograms having large
-     * numbers of bins (e.g., thousands).
-     */
-    DEVICE_HISTO_GLOBAL_ATOMIC,
-
-};
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeHistogram
- */
-template <
-    int                             _BLOCK_THREADS,         ///< Threads per thread block
-    int                             _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    DeviceHistogramAlgorithm        _HISTO_ALGORITHM,       ///< Cooperative histogram algorithm to use
-    GridMappingStrategy             _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
-struct BlockRangeHistogramPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const DeviceHistogramAlgorithm   HISTO_ALGORITHM     = _HISTO_ALGORITHM;     ///< Cooperative histogram algorithm to use
-    static const GridMappingStrategy        GRID_MAPPING        = _GRID_MAPPING;        ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
- */
-template <
-    typename    BlockRangeHistogramPolicy,      ///< Parameterized BlockRangeHistogramPolicy tuning policy type
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                  ///< Random-access input iterator type for reading samples.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                         ///< Signed integer type for global offsets
-struct BlockRangeHistogram
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Histogram grid algorithm
-    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockRangeHistogramPolicy::HISTO_ALGORITHM;
-
-    // Alternative internal implementation types
-    typedef BlockRangeHistogramSort<            BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramSortT;
-    typedef BlockRangeHistogramSharedAtomic<    BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramSharedAtomicT;
-    typedef BlockRangeHistogramGlobalAtomic<    BlockRangeHistogramPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockRangeHistogramGlobalAtomicT;
-
-    // Internal block sweep histogram type
-    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
-        BlockRangeHistogramSortT,
-        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
-            BlockRangeHistogramSharedAtomicT,
-            BlockRangeHistogramGlobalAtomicT>::Type>::Type InternalBlockDelegate;
-
-    enum
-    {
-        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
-    };
-
-
-    // Temporary storage type
-    typedef typename InternalBlockDelegate::TempStorage TempStorage;
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Internal block delegate
-    InternalBlockDelegate internal_delegate;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeHistogram(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        internal_delegate(temp_storage, d_in, d_out_histograms)
-    {}
-
-
-    /**
-     * \brief Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset   block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                              num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
-    {
-        even_share.BlockInit();
-        ConsumeRange(even_share.block_offset, even_share.block_end);
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        GridQueue<Offset>   queue)              ///< Queue descriptor for assigning tiles of work to thread blocks
-    {
-        // Shared block offset
-        __shared__ Offset shared_block_offset;
-
-        // We give each thread block at least one tile of input.
-        Offset block_offset      = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base   = gridDim.x * TILE_ITEMS;
-
-        // Process full tiles of input
-        while (block_offset + TILE_ITEMS <= num_items)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-
-            // Dequeue up to TILE_ITEMS
-            if (threadIdx.x == 0)
-                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-            __syncthreads();
-
-            block_offset = shared_block_offset;
-
-            __syncthreads();
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                          num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue);
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh
deleted file mode 100644
index 50546a5b7..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_downsweep.cuh
+++ /dev/null
@@ -1,736 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-
-
-#pragma once
-
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_radix_rank.cuh"
-#include "../block/block_exchange.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Types of scattering strategies
- */
-enum RadixSortScatterAlgorithm
-{
-    RADIX_SORT_SCATTER_DIRECT,      ///< Scatter directly from registers to global bins
-    RADIX_SORT_SCATTER_TWO_PHASE,   ///< First scatter from registers into shared memory bins, then into global bins
-};
-
-
-/**
- * Parameterizable tuning policy type for BlockRangeRadixSortDownsweep
- */
-template <
-    int                         _BLOCK_THREADS,             ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,             ///< Cache load modifier for reading keys (and values)
-    bool                        _EXCHANGE_TIME_SLICING,     ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-    bool                        _MEMOIZE_OUTER_SCAN,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure.  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    BlockScanAlgorithm          _INNER_SCAN_ALGORITHM,      ///< The BlockScan algorithm algorithm to use
-    RadixSortScatterAlgorithm   _SCATTER_ALGORITHM,         ///< The scattering strategy to use
-    cudaSharedMemConfig         _SMEM_CONFIG,               ///< Shared memory bank mode
-    int                         _RADIX_BITS>                ///< The number of radix bits, i.e., log2(bins)
-struct BlockRangeRadixSortDownsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        EXCHANGE_TIME_SLICING   = _EXCHANGE_TIME_SLICING,   ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
-        MEMOIZE_OUTER_SCAN      = _MEMOIZE_OUTER_SCAN,      ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure.  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    };
-
-    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier          LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading keys (and values)
-    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = _INNER_SCAN_ALGORITHM;    ///< The BlockScan algorithm algorithm to use
-    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = _SCATTER_ALGORITHM;       ///< The scattering strategy to use
-    static const cudaSharedMemConfig        SMEM_CONFIG             = _SMEM_CONFIG;             ///< Shared memory bank mode
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-template <
-    typename BlockRangeRadixSortDownsweepPolicy,        ///< Parameterized BlockRangeRadixSortDownsweepPolicy tuning policy type
-    bool     DESCENDING,                                   ///< Whether or not the sorted-order is high-to-low
-    typename Key,                                       ///< Key type
-    typename Value,                                     ///< Value type
-    typename Offset>                                    ///< Signed integer type for global offsets
-struct BlockRangeRadixSortDownsweep
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // Appropriate unsigned-bits representation of Key
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-    static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
-    static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
-
-    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = BlockRangeRadixSortDownsweepPolicy::LOAD_ALGORITHM;
-    static const CacheLoadModifier          LOAD_MODIFIER           = BlockRangeRadixSortDownsweepPolicy::LOAD_MODIFIER;
-    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = BlockRangeRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
-    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = BlockRangeRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
-    static const cudaSharedMemConfig        SMEM_CONFIG             = BlockRangeRadixSortDownsweepPolicy::SMEM_CONFIG;
-
-    enum
-    {
-        BLOCK_THREADS           = BlockRangeRadixSortDownsweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = BlockRangeRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING   = BlockRangeRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING,
-        RADIX_BITS              = BlockRangeRadixSortDownsweepPolicy::RADIX_BITS,
-        MEMOIZE_OUTER_SCAN      = BlockRangeRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-        KEYS_ONLY               = Equals<Value, NullType>::VALUE,
-
-        WARP_THREADS            = CUB_PTX_LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_SIZET         = sizeof(Offset),
-        LOG_BYTES_PER_SIZET     = Log2<BYTES_PER_SIZET>::VALUE,
-
-        LOG_SMEM_BANKS          = CUB_PTX_LOG_SMEM_BANKS,
-        SMEM_BANKS              = 1 << LOG_SMEM_BANKS,
-
-        DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
-        SCATTER_PASSES          = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
-
-        LOG_STORE_TXN_THREADS   = LOG_SMEM_BANKS,
-        STORE_TXN_THREADS       = 1 << LOG_STORE_TXN_THREADS,
-    };
-
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset>  KeysItr;
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset>         ValuesItr;
-
-    // BlockRadixRank type
-    typedef BlockRadixRank<
-        BLOCK_THREADS,
-        RADIX_BITS,
-        DESCENDING,
-        MEMOIZE_OUTER_SCAN,
-        INNER_SCAN_ALGORITHM,
-        SMEM_CONFIG> BlockRadixRank;
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        KeysItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadKeys;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValuesItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadValues;
-
-    // BlockExchange type (keys)
-    typedef BlockExchange<
-        UnsignedBits,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeKeys;
-
-    // BlockExchange type (values)
-    typedef BlockExchange<
-        Value,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeValues;
-
-
-    /**
-     * Shared memory storage layout
-     */
-    struct _TempStorage
-    {
-        Offset  relative_bin_offsets[RADIX_DIGITS + 1];
-        bool    short_circuit;
-
-        union
-        {
-            typename BlockRadixRank::TempStorage        ranking;
-            typename BlockLoadKeys::TempStorage         load_keys;
-            typename BlockLoadValues::TempStorage       load_values;
-            typename BlockExchangeKeys::TempStorage     exchange_keys;
-            typename BlockExchangeValues::TempStorage   exchange_values;
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-    ValuesItr       d_values_in;
-    UnsignedBits    *d_keys_out;
-    Value           *d_values_out;
-
-    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
-    Offset          bin_offset;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Whether to short-ciruit
-    bool            short_circuit;
-
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decodes given keys to lookup digit offsets in shared memory
-     */
-    __device__ __forceinline__ void DecodeRelativeBinOffsets(
-        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset          (&relative_bin_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, RADIX_BITS);
-
-            // Lookup base digit offset from shared memory
-            relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
-        }
-    }
-
-
-    /**
-     * Scatter ranked items to global memory
-     */
-    template <bool FULL_TILE, typename T>
-    __device__ __forceinline__ void ScatterItems(
-        T       (&items)[ITEMS_PER_THREAD],
-        int     (&local_ranks)[ITEMS_PER_THREAD],
-        Offset  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        T       *d_out,
-        Offset  valid_items)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Scatter if not out-of-bounds
-            if (FULL_TILE || (local_ranks[ITEM] < valid_items))
-            {
-                d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM];
-            }
-        }
-    }
-
-
-    /**
-     * Scatter ranked keys directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
-    {
-        // Compute scatter offsets
-        DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
-
-        // Untwiddle keys before outputting
-        UnsignedBits keys[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]);
-        }
-
-        // Scatter to global
-        ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked keys through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
-    {
-        // Exchange keys through shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterKeys<FULL_TILE>(
-            twiddled_keys,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Scatter ranked values directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value                                   (&values)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
-    {
-        // Scatter to global
-        ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked values through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value                                   (&values)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
-    {
-        __syncthreads();
-
-        // Exchange keys through shared memory
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Load a tile of items (specialized for full tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT      &block_loader, 
-        T               (&items)[ITEMS_PER_THREAD],
-        InputIterator   d_in,
-        Offset          valid_items,
-        Int2Type<true>  is_full_tile)
-    {
-        block_loader.Load(d_in, items);
-    }
-
-
-    /**
-     * Load a tile of items (specialized for partial tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT      &block_loader, 
-        T               (&items)[ITEMS_PER_THREAD],
-        InputIterator   d_in,
-        Offset          valid_items,
-        Int2Type<false> is_full_tile)
-    {
-        block_loader.Load(d_in, items, valid_items);
-    }
-
-
-    /**
-     * Truck along associated values
-     */
-    template <bool FULL_TILE, typename _Value>
-    __device__ __forceinline__ void GatherScatterValues(
-        _Value      (&values)[ITEMS_PER_THREAD],
-        Offset      (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        Offset      block_offset,
-        Offset      valid_items)
-    {
-        __syncthreads();
-
-        BlockLoadValues loader(temp_storage.load_values);
-        LoadItems(
-            loader,
-            values,
-            d_values_in + block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>());
-
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            ranks,
-            valid_items,
-            Int2Type<SCATTER_ALGORITHM>());
-    }
-
-
-    /**
-     * Truck along associated values (specialized for key-only sorting)
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        NullType    (&values)[ITEMS_PER_THREAD],
-        Offset      (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        Offset      block_offset,
-        Offset      valid_items)
-    {}
-
-
-    /**
-     * Process tile
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        Offset block_offset,
-        const Offset &valid_items = TILE_ITEMS)
-    {
-        // Per-thread tile data
-        UnsignedBits    keys[ITEMS_PER_THREAD];                     // Keys
-        UnsignedBits    twiddled_keys[ITEMS_PER_THREAD];            // Twiddled keys
-        int             ranks[ITEMS_PER_THREAD];                    // For each key, the local rank within the CTA
-        Offset          relative_bin_offsets[ITEMS_PER_THREAD];     // For each key, the global scatter base offset of the corresponding digit
-
-        // Assign max-key to all keys
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY;
-        }
-
-        // Load tile of keys
-        BlockLoadKeys loader(temp_storage.load_keys);
-        LoadItems(
-            loader,
-            keys,
-            d_keys_in + block_offset,
-            valid_items, 
-            Int2Type<FULL_TILE>());
-
-        __syncthreads();
-
-        // Twiddle key bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
-        }
-
-        // Rank the twiddled keys
-        int inclusive_digit_prefix;
-        BlockRadixRank(temp_storage.ranking).RankKeys(
-            twiddled_keys,
-            ranks,
-            current_bit,
-            inclusive_digit_prefix);
-
-        // Update global scatter base offsets for each digit
-        if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
-        {
-            int exclusive_digit_prefix;
-
-            // Get exclusive digit prefix from inclusive prefix
-            if (DESCENDING)
-            {
-                // Get the prefix from the next thread (higher bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1);
-                if (threadIdx.x == RADIX_DIGITS - 1)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x + 1] = 0;
-                exchange[threadIdx.x] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x + 1];
-#endif
-            }
-            else
-            {
-                // Get the prefix from the previous thread (lower bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
-                if (threadIdx.x == 0)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x] = 0;
-                exchange[threadIdx.x + 1] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x];
-#endif
-            }
-
-            bin_offset -= exclusive_digit_prefix;
-            temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
-            bin_offset += inclusive_digit_prefix;
-        }
-
-        __syncthreads();
-
-        // Scatter keys
-        ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
-
-        // Gather/scatter values
-        Value values[ITEMS_PER_THREAD];
-        GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
-    }
-
-
-    /**
-     * Copy tiles within the range of input
-     */
-    template <
-        typename InputIterator,
-        typename T>
-    __device__ __forceinline__ void Copy(
-        InputIterator   d_in,
-        T               *d_out,
-        Offset          block_offset,
-        Offset          block_end)
-    {
-        // Simply copy the input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Clean up last partial tile with guarded-I/O
-        if (block_offset < block_end)
-        {
-            Offset valid_items = block_end - block_offset;
-
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
-        }
-    }
-
-
-    /**
-     * Copy tiles within the range of input (specialized for NullType)
-     */
-    template <typename InputIterator>
-    __device__ __forceinline__ void Copy(
-        InputIterator   d_in,
-        NullType        *d_out,
-        Offset          block_offset,
-        Offset          block_end)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset       bin_offset,
-        Key         *d_keys_in,
-        Key         *d_keys_out,
-        Value       *d_values_in,
-        Value       *d_values_out,
-        int         current_bit)
-    :
-        temp_storage(temp_storage.Alias()),
-        bin_offset(bin_offset),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        short_circuit(false)
-    {}
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset      num_items,
-        Offset      *d_spine,
-        Key         *d_keys_in,
-        Key         *d_keys_out,
-        Value       *d_values_in,
-        Value       *d_values_out,
-        int         current_bit)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit)
-    {
-        // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            int bin_idx = (DESCENDING) ?
-                RADIX_DIGITS - threadIdx.x - 1 :
-                threadIdx.x;
-
-            // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
-            Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx];
-            int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
-            this->temp_storage.short_circuit = WarpAll(predicate);
-
-            // Load my block's bin offset for my bin
-            bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
-        }
-
-        __syncthreads();
-
-        short_circuit = this->temp_storage.short_circuit;
-    }
-
-
-    /**
-     * Distribute keys from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        Offset          block_offset,
-        const Offset    &block_end)
-    {
-        if (short_circuit)
-        {
-            // Copy keys
-            Copy(d_keys_in, d_keys_out, block_offset, block_end);
-
-            // Copy values
-            Copy(d_values_in, d_values_out, block_offset, block_end);
-        }
-        else
-        {
-            // Process full tiles of tile_items
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ProcessTile<true>(block_offset);
-                block_offset += TILE_ITEMS;
-
-                __syncthreads();
-            }
-
-            // Clean up last partial tile with guarded-I/O
-            if (block_offset < block_end)
-            {
-                ProcessTile<false>(block_offset, block_end - block_offset);
-            }
-        }
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh
deleted file mode 100644
index efb2f7bd3..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_radix_sort_upsweep.cuh
+++ /dev/null
@@ -1,443 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
- */
-
-#pragma once
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeRadixSortUpsweep
- */
-template <
-    int                 _BLOCK_THREADS,     ///< Threads per thread block
-    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
-    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
-    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
-struct BlockRangeRadixSortUpsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
- */
-template <
-    typename BlockRangeRadixSortUpsweepPolicy,      ///< Parameterized BlockRangeRadixSortUpsweepPolicy tuning policy type
-    typename Key,                                   ///< Key type
-    typename Offset>                                ///< Signed integer type for global offsets
-struct BlockRangeRadixSortUpsweep
-{
-
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-    // Integer type for digit counters (to be packed into words of PackedCounters)
-    typedef unsigned char DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef unsigned int PackedCounter;
-
-    static const CacheLoadModifier LOAD_MODIFIER = BlockRangeRadixSortUpsweepPolicy::LOAD_MODIFIER;
-
-    enum
-    {
-        RADIX_BITS              = BlockRangeRadixSortUpsweepPolicy::RADIX_BITS,
-        BLOCK_THREADS           = BlockRangeRadixSortUpsweepPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD         = BlockRangeRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
-        WARP_THREADS            = 1 << LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
-
-        BYTES_PER_COUNTER       = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
-        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
-
-        // To prevent counter overflow, we must periodically unpack and aggregate the
-        // digit counters back into registers.  Each counter lane is assigned to a
-        // warp for aggregation.
-
-        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
-
-        // Unroll tiles in batches without risk of counter overflow
-        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
-        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
-    };
-
-
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset>  KeysItr;
-
-    /**
-     * Shared memory storage layout
-     */
-    struct _TempStorage
-    {
-        union
-        {
-            DigitCounter    digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter   packed_counters[COUNTER_LANES][BLOCK_THREADS];
-            Offset          digit_partials[RADIX_DIGITS][WARP_THREADS + 1];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields (aggregate state bundle)
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Thread-local counters for periodically aggregating composite-counter lanes
-    Offset          local_counts[LANES_PER_WARP][PACKING_RATIO];
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-
-
-    //---------------------------------------------------------------------
-    // Helper structure for templated iteration
-    //---------------------------------------------------------------------
-
-    // Iterate
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(
-            BlockRangeRadixSortUpsweep     &cta,
-            UnsignedBits                    keys[KEYS_PER_THREAD])
-        {
-            cta.Bucket(keys[COUNT]);
-
-            // Next
-            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
-        }
-    };
-
-    // Terminate
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(BlockRangeRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
-    };
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decode a key and increment corresponding smem digit counter
-     */
-    __device__ __forceinline__ void Bucket(UnsignedBits key)
-    {
-        // Perform transform op
-        UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
-
-        // Add in sub-counter offset
-        UnsignedBits sub_counter = BFE(converted_key, current_bit, LOG_PACKING_RATIO);
-
-        // Add in row offset
-        UnsignedBits row_offset = BFE(converted_key, current_bit + LOG_PACKING_RATIO, LOG_COUNTER_LANES);
-
-        // Increment counter
-        temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
-
-    }
-
-
-    /**
-     * Reset composite counters
-     */
-    __device__ __forceinline__ void ResetDigitCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
-        {
-            temp_storage.packed_counters[LANE][threadIdx.x] = 0;
-        }
-    }
-
-
-    /**
-     * Reset the unpacked counters in each thread
-     */
-    __device__ __forceinline__ void ResetUnpackedCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            #pragma unroll
-            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-            {
-                local_counts[LANE][UNPACKED_COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Extracts and aggregates the digit counters for each counter lane
-     * owned by this warp
-     */
-    __device__ __forceinline__ void UnpackDigitCounts()
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
-
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            const int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                #pragma unroll
-                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
-                {
-                    #pragma unroll
-                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                    {
-                        Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
-                        local_counts[LANE][UNPACKED_COUNTER] += counter;
-                    }
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Places unpacked counters into smem for final digit reduction
-     */
-    __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count)
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        __syncthreads();
-
-        // Rake-reduce bin_count reductions
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            bin_count = ThreadReduce<WARP_THREADS>(
-                temp_storage.digit_partials[threadIdx.x],
-                Sum());
-        }
-    }
-
-
-    /**
-     * Processes a single, full tile
-     */
-    __device__ __forceinline__ void ProcessFullTile(Offset block_offset)
-    {
-        // Tile of keys
-        UnsignedBits keys[KEYS_PER_THREAD];
-
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
-
-        // Prevent hoisting
-//        __threadfence_block();
-//        __syncthreads();
-
-        // Bucket tile of keys
-        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
-    }
-
-
-    /**
-     * Processes a single load (may have some threads masked off)
-     */
-    __device__ __forceinline__ void ProcessPartialTile(
-        Offset block_offset,
-        const Offset &block_end)
-    {
-        // Process partial tile if necessary using single loads
-        block_offset += threadIdx.x;
-        while (block_offset < block_end)
-        {
-            // Load and bucket key
-            UnsignedBits key = d_keys_in[block_offset];
-            Bucket(key);
-            block_offset += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeRadixSortUpsweep(
-        TempStorage &temp_storage,
-        Key         *d_keys_in,
-        int         current_bit)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        current_bit(current_bit)
-    {}
-
-
-    /**
-     * Compute radix digit histograms from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        Offset           block_offset,
-        const Offset     &block_end,
-        Offset           &bin_count)                ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
-    {
-        // Reset digit counters in smem and unpacked counters in registers
-        ResetDigitCounters();
-        ResetUnpackedCounters();
-
-        // Unroll batches of full tiles
-        while (block_offset + UNROLLED_ELEMENTS <= block_end)
-        {
-            for (int i = 0; i < UNROLL_COUNT; ++i)
-            {
-                ProcessFullTile(block_offset);
-                block_offset += TILE_ITEMS;
-            }
-
-            __syncthreads();
-
-            // Aggregate back into local_count registers to prevent overflow
-            UnpackDigitCounts();
-
-            __syncthreads();
-
-            // Reset composite counters in lanes
-            ResetDigitCounters();
-        }
-
-        // Unroll single full tiles
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ProcessFullTile(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process partial tile if necessary
-        ProcessPartialTile(
-            block_offset,
-            block_end);
-
-        __syncthreads();
-
-        // Aggregate back into local_count registers
-        UnpackDigitCounts();
-
-        __syncthreads();
-
-        // Final raking reduction of counts by bin
-        ReduceUnpackedCounts(bin_count);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh
deleted file mode 100644
index 9e97f87bc..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_reduce.cuh
+++ /dev/null
@@ -1,430 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../block/block_load.cuh"
-#include "../block/block_reduce.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeReduce
- */
-template <
-    int                     _BLOCK_THREADS,         ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
-    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier       _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
-    GridMappingStrategy     _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
-struct BlockRangeReducePolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
-    };
-
-    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
-    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const GridMappingStrategy   GRID_MAPPING         = _GRID_MAPPING;        ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE, this
- * partial reduction is stored into \p thread_aggregate.  Otherwise it is
- * accumulated into \p thread_aggregate.
- */
-template <
-    typename BlockRangeReducePolicy,        ///< Parameterized BlockRangeReducePolicy tuning policy type
-    typename InputIterator,                 ///< Random-access iterator type for input
-    typename Offset,                        ///< Signed integer type for global offsets
-    typename ReductionOp>                   ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct BlockRangeReduce
-{
-
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The value type of the input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Vector type of T for data movement
-    typedef typename CubVector<T, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, T, Offset>,  // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                            // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockRangeReducePolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeReducePolicy::ITEMS_PER_THREAD,
-        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, BlockRangeReducePolicy::VECTOR_LOAD_LENGTH),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
-        CAN_VECTORIZE       = (VECTOR_LOAD_LENGTH > 1) &&
-                                (IsPointer<InputIterator>::VALUE) &&
-                                Traits<T>::PRIMITIVE,
-
-    };
-
-    static const CacheLoadModifier    LOAD_MODIFIER   = BlockRangeReducePolicy::LOAD_MODIFIER;
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockRangeReducePolicy::BLOCK_ALGORITHM;
-
-    // Parameterized BlockReduce primitive
-    typedef BlockReduce<T, BLOCK_THREADS, BlockRangeReducePolicy::BLOCK_ALGORITHM> BlockReduceT;
-
-    /// Shared memory type required by this thread block
-    typedef typename BlockReduceT::TempStorage _TempStorage;
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    T                       thread_aggregate;   ///< Each thread's partial reduction
-    _TempStorage&           temp_storage;       ///< Reference to temp_storage
-    InputIterator           d_in;               ///< Input data to reduce
-    WrappedInputIterator    d_wrapped_in;       ///< Wrapped input data to reduce
-    ReductionOp             reduction_op;       ///< Binary reduction operator
-    int                     first_tile_size;    ///< Size of first tile consumed
-    bool                    is_aligned;         ///< Whether or not input is vector-aligned
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-
-    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<true>  can_vectorize)
-    {
-        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-    }
-
-    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<false> can_vectorize)
-    {
-        return false;
-    }
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeReduce(
-        TempStorage&            temp_storage,       ///< Reference to temp_storage
-        InputIterator           d_in,               ///< Input data to reduce
-        ReductionOp             reduction_op)       ///< Binary reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_wrapped_in(d_in),
-        reduction_op(reduction_op),
-        first_tile_size(0),
-        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
-    {}
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we cannot vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset             block_offset,            ///< The offset the tile to consume
-        Int2Type<false>     can_vectorize)           ///< Whether or not we can vectorize loads
-    {
-        T items[ITEMS_PER_THREAD];
-
-        // Load items in striped fashion
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
-
-        // Reduce items within each thread stripe
-        return ThreadReduce(items, reduction_op);
-    }
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we can vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset             block_offset,            ///< The offset the tile to consume
-        Int2Type<true>      can_vectorize)           ///< Whether or not we can vectorize loads
-    {
-        if (!is_aligned)
-        {
-            // Not aligned
-            return ConsumeFullTile(block_offset, Int2Type<false>());
-        }
-        else
-        {
-            // Alias items as an array of VectorT and load it in striped fashion
-            enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
-
-            T items[ITEMS_PER_THREAD];
-
-            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
-
-            // Vector input iterator wrapper type
-            CacheModifiedInputIterator<BlockRangeReducePolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
-                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
-
-            #pragma unroll
-            for (int i = 0; i < WORDS; ++i)
-                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
-
-            // Reduce items within each thread stripe
-            return ThreadReduce(items, reduction_op);
-        }
-    }
-
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset  block_offset,                   ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile
-            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
-
-            // Update running thread aggregate
-            thread_aggregate = (first_tile_size) ?
-                reduction_op(thread_aggregate, partial) :       // Update
-                partial;                                        // Assign
-        }
-        else
-        {
-            // Partial tile
-            int thread_offset = threadIdx.x;
-
-            if (!first_tile_size && (thread_offset < valid_items))
-            {
-                // Assign thread_aggregate
-                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
-                thread_offset += BLOCK_THREADS;
-            }
-
-            while (thread_offset < valid_items)
-            {
-                // Update thread aggregate
-                T item = d_wrapped_in[block_offset + thread_offset];
-                thread_aggregate = reduction_op(thread_aggregate, item);
-                thread_offset += BLOCK_THREADS;
-            }
-        }
-
-        // Set first tile size if necessary
-        if (!first_tile_size)
-            first_tile_size = valid_items;
-    }
-
-
-    //---------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
-        T       &block_aggregate)                   ///< [out] Running total
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                              num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
-        T                                   &block_aggregate,   ///< [out] Running total
-        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
-    {
-        // Initialize even-share descriptor for this thread block
-        even_share.BlockInit();
-
-        // Consume input tiles
-        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        GridQueue<Offset>   queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        T                   &block_aggregate)   ///< [out] Running total
-    {
-        // Shared dequeue offset
-        __shared__ Offset dequeue_offset;
-
-        // We give each thread block at least one tile of input.
-        Offset block_offset = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base = gridDim.x * TILE_ITEMS;
-
-        if (block_offset + TILE_ITEMS <= num_items)
-        {
-            // Consume full tile of input
-            ConsumeTile<true>(block_offset);
-
-            // Dequeue more tiles
-            while (true)
-            {
-                 // Dequeue a tile of items
-                if (threadIdx.x == 0)
-                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-                __syncthreads();
-
-                // Grab tile offset and check if we're done with full tiles
-                block_offset = dequeue_offset;
-
-                __syncthreads();
-
-                if (block_offset + TILE_ITEMS > num_items)
-                    break;
-
-                // Consume a full tile
-                ConsumeTile<true>(block_offset);
-            }
-        }
-
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                          num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
-        T                               &block_aggregate,   ///< [out] Running total
-        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue, block_aggregate);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh
deleted file mode 100644
index f56baaa0e..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_reduce_by_key.cuh
+++ /dev/null
@@ -1,1034 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeReduceByKey
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _TWO_PHASE_SCATTER,             ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockRangeReduceByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        TWO_PHASE_SCATTER       = _TWO_PHASE_SCATTER,           ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Tile status interface types
- ******************************************************************************/
-
-/**
- * Tile status interface for reduction by key.
- *
- */
-template <
-    typename    Value,
-    typename    Offset,
-    bool        SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)>
-struct ReduceByKeyScanTileState;
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <
-    typename    Value,
-    typename    Offset>
-struct ReduceByKeyScanTileState<Value, Offset, false> :
-    ScanTileState<ItemOffsetPair<Value, Offset> >
-{
-    typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState() : SuperClass() {}
-};
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * can be combined into one machine word that can be read/written coherently in a single access.
- */
-template <
-    typename Value,
-    typename Offset>
-struct ReduceByKeyScanTileState<Value, Offset, true>
-{
-    typedef ItemOffsetPair<Value, Offset> ItemOffsetPair;
-
-    // Constants
-    enum
-    {
-        PAIR_SIZE           = sizeof(Value) + sizeof(Offset),
-        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
-        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
-
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Status word type
-    typedef typename If<(STATUS_WORD_SIZE == 8),
-        long long,
-        typename If<(STATUS_WORD_SIZE == 4),
-            int,
-            typename If<(STATUS_WORD_SIZE == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-    // Status word type
-    typedef typename If<(TXN_WORD_SIZE == 16),
-        longlong2,
-        typename If<(TXN_WORD_SIZE == 8),
-            long long,
-            int>::Type>::Type TxnWord;
-
-    // Device word type (for when sizeof(Value) == sizeof(Offset))
-    struct TileDescriptorBigStatus
-    {
-        Offset      offset;
-        Value       value;
-        StatusWord  status;
-    };
-
-    // Device word type (for when sizeof(Value) != sizeof(Offset))
-    struct TileDescriptorLittleStatus
-    {
-        Value       value;
-        StatusWord  status;
-        Offset      offset;
-    };
-
-    // Device word type
-    typedef typename If<
-            (sizeof(Value) == sizeof(Offset)),
-            TileDescriptorBigStatus,
-            TileDescriptorLittleStatus>::Type
-        TileDescriptor;
-
-
-    // Device storage
-    TileDescriptor *d_tile_status;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState()
-    :
-        d_tile_status(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, ItemOffsetPair tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive.value;
-        tile_descriptor.offset = tile_inclusive.offset;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, ItemOffsetPair tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial.value;
-        tile_descriptor.offset = tile_partial.offset;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        ItemOffsetPair  &value)
-    {
-        // Use warp-any to determine when all threads have valid status
-        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        while ((tile_descriptor.status == SCAN_TILE_INVALID))
-        {
-            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        }
-
-        status = tile_descriptor.status;
-        value.value = tile_descriptor.value;
-        value.offset = tile_descriptor.offset;
-    }
-
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles
- */
-template <
-    typename    BlockRangeReduceByKeyPolicy,    ///< Parameterized BlockRangeReduceByKeyPolicy tuning policy type
-    typename    KeyInputIterator,               ///< Random-access input iterator type for keys
-    typename    KeyOutputIterator,              ///< Random-access output iterator type for keys
-    typename    ValueInputIterator,             ///< Random-access input iterator type for values
-    typename    ValueOutputIterator,            ///< Random-access output iterator type for values
-    typename    EqualityOp,                     ///< Key equality operator type
-    typename    ReductionOp,                    ///< Value reduction operator type
-    typename    Offset>                         ///< Signed integer type for global offsets
-struct BlockRangeReduceByKey
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of key iterator
-    typedef typename std::iterator_traits<KeyInputIterator>::value_type Key;
-
-    // Data type of value iterator
-    typedef typename std::iterator_traits<ValueInputIterator>::value_type Value;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-        WARPS               = BLOCK_THREADS / CUB_PTX_WARP_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER   = (BlockRangeReduceByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO       = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE),
-
-        // Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        // Whether or not this is run-length-encoding with a constant iterator as values
-        IS_RUN_LENGTH_ENCODE    = (Equals<ValueInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValueInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE),
-
-    };
-
-    // Cache-modified input iterator wrapper type for keys
-    typedef typename If<IsPointer<KeyInputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Key, Offset>,   // Wrap the native input pointer with CacheModifiedValueInputIterator
-            KeyInputIterator>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedKeyInputIterator;
-
-    // Cache-modified input iterator wrapper type for values
-    typedef typename If<IsPointer<ValueInputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeReduceByKeyPolicy::LOAD_MODIFIER, Value, Offset>,  // Wrap the native input pointer with CacheModifiedValueInputIterator
-            ValueInputIterator>::Type                                                                // Directly use the supplied input iterator type
-        WrappedValueInputIterator;
-
-    // Value-offset tuple type for scanning (maps accumulated values to segment index)
-    typedef ItemOffsetPair<Value, Offset> ValueOffsetPair;
-
-    // Reduce-value-by-segment scan operator
-    struct ReduceByKeyOp
-    {
-        ReductionOp op;                 ///< Wrapped reduction operator
-
-        /// Constructor
-        __device__ __forceinline__ ReduceByKeyOp(ReductionOp op) : op(op) {}
-
-        /// Scan operator (specialized for sum on primitive types)
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair   &first,             ///< First partial reduction
-            const ValueOffsetPair   &second,            ///< Second partial reduction
-            Int2Type<true>          has_identity_zero)  ///< Whether the operation has a zero-valued identity
-        {
-            Value select = (second.offset) ? 0 : first.value;
-
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            retval.value = op(select, second.value);
-            return retval;
-        }
-
-        /// Scan operator (specialized for reductions without zero-valued identity)
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair   &first,             ///< First partial reduction
-            const ValueOffsetPair   &second,            ///< Second partial reduction
-            Int2Type<false>         has_identity_zero)  ///< Whether the operation has a zero-valued identity
-        {
-#if (__CUDA_ARCH__ > 130)
-            // This expression uses less registers and is faster when compiled with nvvm
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            if (second.offset)
-            {
-                retval.value = second.value;
-                return retval;
-            }
-            else
-            {
-                retval.value = op(first.value, second.value);
-                return retval;
-            }
-#else
-            // This expression uses less registers and is faster when compiled with Open64
-            ValueOffsetPair retval;
-            retval.offset = first.offset + second.offset;
-            retval.value = (second.offset) ?
-                    second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
-                    op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
-            return retval;
-#endif
-        }
-
-        /// Scan operator
-        __device__ __forceinline__ ValueOffsetPair operator()(
-            const ValueOffsetPair &first,       ///< First partial reduction
-            const ValueOffsetPair &second)      ///< Second partial reduction
-        {
-            return (*this)(first, second, Int2Type<HAS_IDENTITY_ZERO>());
-        }
-    };
-
-    // Parameterized BlockLoad type for keys
-    typedef BlockLoad<
-            WrappedKeyInputIterator,
-            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
-            BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
-        BlockLoadKeys;
-
-    // Parameterized BlockLoad type for values
-    typedef BlockLoad<
-            WrappedValueInputIterator,
-            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-            BlockRangeReduceByKeyPolicy::ITEMS_PER_THREAD,
-            (IS_RUN_LENGTH_ENCODE) ?
-                BLOCK_LOAD_DIRECT :
-                (BlockLoadAlgorithm) BlockRangeReduceByKeyPolicy::LOAD_ALGORITHM>
-        BlockLoadValues;
-
-    // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
-    typedef BlockExchange<
-            Key,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeKeys;
-
-    // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
-    typedef BlockExchange<
-            Value,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeValues;
-
-    // Parameterized BlockDiscontinuity type for keys
-    typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            ValueOffsetPair,
-            BlockRangeReduceByKeyPolicy::BLOCK_THREADS,
-            BlockRangeReduceByKeyPolicy::SCAN_ALGORITHM>
-        BlockScanAllocations;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            ValueOffsetPair,
-            ReduceByKeyOp,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-
-        union
-        {
-            struct
-            {
-                typename BlockScanAllocations::TempStorage      scan;           // Smem needed for tile scanning
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;         // Smem needed for cooperative prefix callback
-                typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
-                typename BlockLoadKeys::TempStorage             load_keys;      // Smem needed for loading keys
-
-                Offset      tile_idx;               // Shared tile index
-                Offset      tile_num_flags_prefix;  // Exclusive tile prefix
-            };
-
-            // Smem needed for loading values
-            typename BlockLoadValues::TempStorage load_values;
-
-            // Smem needed for compacting values
-            typename BlockExchangeValues::TempStorage exchange_values;
-
-            // Smem needed for compacting keys
-            typename BlockExchangeKeys::TempStorage exchange_keys;
-        };
-
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;      ///< Reference to temp_storage
-
-    WrappedKeyInputIterator         d_keys_in;          ///< Input keys
-    KeyOutputIterator               d_keys_out;         ///< Output keys
-
-    WrappedValueInputIterator       d_values_in;        ///< Input values
-    ValueOutputIterator             d_values_out;       ///< Output values
-
-    InequalityWrapper<EqualityOp>   inequality_op;      ///< Key inequality operator
-    ReduceByKeyOp                   scan_op;            ///< Reduce-value-by flag scan operator
-    Offset                          num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockRangeReduceByKey(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        KeyInputIterator            d_keys_in,          ///< Input keys
-        KeyOutputIterator           d_keys_out,         ///< Output keys
-        ValueInputIterator          d_values_in,        ///< Input values
-        ValueOutputIterator         d_values_out,       ///< Output values
-        EqualityOp                  equality_op,        ///< Key equality operator
-        ReductionOp                 reduction_op,       ///< Value reduction operator
-        Offset                      num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(d_keys_in),
-        d_keys_out(d_keys_out),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        inequality_op(equality_op),
-        scan_op(reduction_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan with identity (first tile)
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ValueOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        ValueOffsetPair     &block_aggregate,
-        Int2Type<true>      has_identity)
-    {
-        ValueOffsetPair identity;
-        identity.value = 0;
-        identity.offset = 0;
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate);
-    }
-
-    /**
-     * Scan without identity (first tile).  Without an identity, the first output item is undefined.
-     *
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ValueOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        ValueOffsetPair     &block_aggregate,
-        Int2Type<false>     has_identity)
-    {
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate);
-    }
-
-    /**
-     * Scan with identity (subsequent tile)
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ValueOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        ValueOffsetPair             &block_aggregate,
-        LookbackPrefixCallbackOp    &prefix_op,
-        Int2Type<true>              has_identity)
-    {
-        ValueOffsetPair identity;
-        identity.value = 0;
-        identity.offset = 0;
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Scan without identity (subsequent tile).  Without an identity, the first output item is undefined.
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ValueOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        ValueOffsetPair             &block_aggregate,
-        LookbackPrefixCallbackOp    &prefix_op,
-        Int2Type<false>             has_identity)
-    {
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Zip utility methods
-    //---------------------------------------------------------------------
-
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void ZipValuesAndFlags(
-        Offset          num_remaining,
-        Value           (&values)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD])
-    {
-        // Zip values and flags
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Unset flags for out-of-bounds keys
-            if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining))
-                flags[ITEM] = 0;
-
-            values_and_segments[ITEM].value      = values[ITEM];
-            values_and_segments[ITEM].offset     = flags[ITEM];
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for direct scattering)
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate. As such:
-     * - The scatter offsets must be decremented for value value aggregates
-     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
-     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
-     *
-     */
-    template <bool LAST_TILE, bool FIRST_TILE, int ITEM>
-    __device__ __forceinline__ void ScatterDirect(
-        Offset              num_remaining,
-        Key                 (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset              (&flags)[ITEMS_PER_THREAD],
-        Offset              tile_num_flags,
-        Int2Type<ITEM>      iteration)
-    {
-        // Scatter key
-        if (flags[ITEM])
-        {
-            d_keys_out[values_and_segments[ITEM].offset] = keys[ITEM];
-        }
-
-        bool is_first_flag     = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
-        bool is_oob_value      = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
-
-        // Scatter value reduction
-        if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
-        {
-            d_values_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
-        }
-
-        ScatterDirect<LAST_TILE, FIRST_TILE>(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type<ITEM + 1>());
-    }
-
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        Offset                      num_remaining,
-        Key                         (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset                      (&flags)[ITEMS_PER_THREAD],
-        Offset                      tile_num_flags,
-        Int2Type<ITEMS_PER_THREAD>  iteration)
-    {}
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate. As such:
-     * - The scatter offsets must be decremented for value value aggregates
-     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
-     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
-     *
-     */
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Offset          tile_num_flags_prefix)
-    {
-        int     local_ranks[ITEMS_PER_THREAD];
-        Value   values[ITEMS_PER_THREAD];
-
-        // Share exclusive tile prefix
-        if (threadIdx.x == 0)
-        {
-            temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
-        }
-
-        __syncthreads();
-
-        // Load exclusive tile prefix in all threads
-        tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
-
-        __syncthreads();
-
-        // Compute local scatter ranks
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
-        }
-
-        // Compact keys in shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
-
-        // Scatter keys
-        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_out + tile_num_flags_prefix, keys, tile_num_flags);
-
-        // Unzip values and set flag for first oob item in last tile
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            values[ITEM] = values_and_segments[ITEM].value;
-
-            if (FIRST_TILE)
-                local_ranks[ITEM]--;
-
-            if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
-                flags[ITEM] = 1;
-        }
-
-        // Unset first flag in first tile
-        if (FIRST_TILE && (threadIdx.x == 0))
-            flags[0] = 0;
-
-        __syncthreads();
-
-        // Compact values in shared memory
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags);
-
-        // Number to output
-        Offset exchange_count = tile_num_flags;
-
-        if (LAST_TILE && (num_remaining < TILE_ITEMS))
-            exchange_count++;
-
-        if (FIRST_TILE)
-        {
-            exchange_count--;
-        }
-        else
-        {
-            tile_num_flags_prefix--;
-        }
-
-        // Scatter values
-        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_values_out + tile_num_flags_prefix, values, exchange_count);
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ValueOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Offset          tile_num_flags_prefix)
-    {
-        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
-        if ((TWO_PHASE_SCATTER) && ((tile_num_flags >> Log2<BLOCK_THREADS>::VALUE) > 0))
-        {
-            ScatterTwoPhase<LAST_TILE, FIRST_TILE>(
-                num_remaining,
-                keys,
-                values_and_segments,
-                flags,
-                tile_num_flags,
-                tile_num_flags_prefix);
-        }
-        else
-        {
-            ScatterDirect<LAST_TILE, FIRST_TILE>(
-                num_remaining,
-                keys,
-                values_and_segments,
-                flags,
-                tile_num_flags,
-                Int2Type<0>());
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic domino scan)
-     */
-    template <
-        bool                LAST_TILE>
-    __device__ __forceinline__ ValueOffsetPair ConsumeTile(
-        Offset              num_items,          ///< Total number of global input items
-        Offset              num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState  &tile_status)       ///< Global list of tile status
-    {
-            Key                 keys[ITEMS_PER_THREAD];                         // Tile keys
-            Value               values[ITEMS_PER_THREAD];                       // Tile values
-            Offset              flags[ITEMS_PER_THREAD];                        // Segment head flags
-            ValueOffsetPair     values_and_segments[ITEMS_PER_THREAD];          // Zipped values and segment flags|indices
-
-        ValueOffsetPair     running_total;                                  // Running count of segments and current value aggregate (including this tile)
-
-        if (tile_idx == 0)
-        {
-            // First tile
-
-            // Load keys and values
-            if (LAST_TILE)
-            {
-                BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining);
-            }
-            else
-            {
-                BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
-            }
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Load values
-            if (LAST_TILE)
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
-            else
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Set head flags.  First tile sets the first flag for the first item
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op);
-
-            // Zip values and flags
-            ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
-
-            // Exclusive scan of values and flags
-            ValueOffsetPair block_aggregate;
-            ScanBlock(values_and_segments, block_aggregate, Int2Type<HAS_IDENTITY_ZERO>());
-
-            // Update tile status if this is not the last tile
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, block_aggregate);
-
-            // Set offset for first scan output
-            if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0))
-                values_and_segments[0].offset = 0;
-
-            running_total = block_aggregate;
-
-            // Scatter flagged items
-            Scatter<LAST_TILE, true>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0);
-        }
-        else
-        {
-            // Not first tile
-
-            // Load keys and values
-            if (LAST_TILE)
-            {
-                BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining);
-            }
-            else
-            {
-                BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
-            }
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Load values
-            if (LAST_TILE)
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
-            else
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Obtain the last key in the previous tile to compare with
-            Key tile_predecessor_key = (threadIdx.x == 0) ?
-                d_keys_in[block_offset - 1] :
-                ZeroInitialize<Key>();
-
-            // Set head flags
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key);
-
-            // Zip values and flags
-            ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
-
-            // Exclusive scan of values and flags
-            ValueOffsetPair block_aggregate;
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
-
-            ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type<HAS_IDENTITY_ZERO>());
-            running_total = prefix_op.inclusive_prefix;
-
-            // Scatter flagged items
-            Scatter<LAST_TILE, false>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset);
-        }
-
-        return running_total;
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic domino scan
-     */
-    template <typename NumSegmentsIterator>         ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        GridQueue<int>          queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState      &tile_status,       ///< Global list of tile status
-        NumSegmentsIterator     d_num_segments)     ///< Output pointer for total number of segments identified
-    {
-#if (CUB_PTX_ARCH <= 130)
-        // Blocks are launched in increasing order, so just assign one tile per block
-
-        int     tile_idx        = (blockIdx.y * 32 * 1024) + blockIdx.x;    // Current tile index
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Output the total number of items selected
-            if (threadIdx.x == 0)
-            {
-                *d_num_segments = running_total.offset;
-
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    d_values_out[running_total.offset - 1] = running_total.value;
-                }
-            }
-        }
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int     tile_idx        = temp_storage.tile_idx;
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;    // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;         // Remaining items (including this tile)
-
-        while (num_remaining > TILE_ITEMS)
-        {
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Consume full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get tile index
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx        = temp_storage.tile_idx;
-            block_offset    = Offset(TILE_ITEMS) * tile_idx;
-            num_remaining   = num_items - block_offset;
-        }
-
-        if (num_remaining > 0)
-        {
-            // Consume last tile (treat as partially-full)
-            ValueOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            if ((threadIdx.x == 0))
-            {
-                // Output the total number of items selected
-                *d_num_segments = running_total.offset;
-
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    d_values_out[running_total.offset - 1] = running_total.value;
-                }
-            }
-        }
-#endif
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh
deleted file mode 100644
index 77d44d114..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_scan.cuh
+++ /dev/null
@@ -1,538 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeScan
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    bool                        _LOAD_WARP_TIME_SLICING,        ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockRangeScanPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,      ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM    = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
- */
-template <
-    typename BlockRangeScanPolicy,      ///< Parameterized BlockRangeScanPolicy tuning policy type
-    typename InputIterator,             ///< Random-access input iterator type
-    typename OutputIterator,            ///< Random-access output iterator type
-    typename ScanOp,                    ///< Scan functor type
-    typename Identity,                  ///< Identity element type (cub::NullType for inclusive scan)
-    typename Offset>                    ///< Signed integer type for global offsets
-struct BlockRangeScan
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<T> ScanTileState;
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeScanPolicy::LOAD_MODIFIER, T, Offset>,    // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                            // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Constants
-    enum
-    {
-        INCLUSIVE           = Equals<Identity, NullType>::VALUE,            // Inclusive scan if no identity type is provided
-        BLOCK_THREADS       = BlockRangeScanPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeScanPolicy::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Parameterized BlockLoad type
-    typedef BlockLoad<
-            WrappedInputIterator,
-            BlockRangeScanPolicy::BLOCK_THREADS,
-            BlockRangeScanPolicy::ITEMS_PER_THREAD,
-            BlockRangeScanPolicy::LOAD_ALGORITHM,
-            BlockRangeScanPolicy::LOAD_WARP_TIME_SLICING>
-        BlockLoadT;
-
-    // Parameterized BlockStore type
-    typedef BlockStore<
-            OutputIterator,
-            BlockRangeScanPolicy::BLOCK_THREADS,
-            BlockRangeScanPolicy::ITEMS_PER_THREAD,
-            BlockRangeScanPolicy::STORE_ALGORITHM,
-            BlockRangeScanPolicy::STORE_WARP_TIME_SLICING>
-        BlockStoreT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            T,
-            BlockRangeScanPolicy::BLOCK_THREADS,
-            BlockRangeScanPolicy::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            T,
-            ScanOp,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef BlockScanRunningPrefixOp<
-            T,
-            ScanOp>
-        RunningPrefixCallbackOp;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-        union
-        {
-            typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
-            typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
-            struct
-            {
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;     // Smem needed for cooperative prefix callback
-                typename BlockScanT::TempStorage                scan;       // Smem needed for tile scanning
-            };
-        };
-
-        Offset tile_idx;   // Shared tile index
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                &temp_storage;      ///< Reference to temp_storage
-    WrappedInputIterator        d_in;               ///< Input data
-    OutputIterator              d_out;              ///< Output data
-    ScanOp                      scan_op;            ///< Binary scan operator
-    Identity                    identity;           ///< Identity element
-
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods (first tile)
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization
-     */
-    template <typename _ScanOp, typename _Identity>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
-    }
-
-    /**
-     * Exclusive sum specialization
-     */
-    template <typename _Identity>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
-    }
-
-    /**
-     * Inclusive scan specialization
-     */
-    template <typename _ScanOp>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
-    }
-
-    /**
-     * Inclusive sum specialization
-     */
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
-    }
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods (subsequent tiles)
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization (with prefix from predecessors)
-     */
-    template <typename _ScanOp, typename _Identity, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Exclusive sum specialization (with prefix from predecessors)
-     */
-    template <typename _Identity, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Inclusive scan specialization (with prefix from predecessors)
-     */
-    template <typename _ScanOp, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Inclusive sum specialization (with prefix from predecessors)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockRangeScan(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIterator               d_in,               ///< Input data
-        OutputIterator              d_out,              ///< Output data
-        ScanOp                      scan_op,            ///< Binary scan operator
-        Identity                    identity)           ///< Identity element
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out(d_out),
-        scan_op(scan_op),
-        identity(identity)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic domino scan)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset                      num_items,          ///< Total number of input items
-        Offset                      num_remaining,      ///< Total number of items remaining to be processed (including this tile)
-        int                         tile_idx,           ///< Tile index
-        Offset                      block_offset,       ///< Tile offset
-        ScanTileState          &tile_status)       ///< Global list of tile status
-    {
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-
-        __syncthreads();
-
-        // Perform tile scan
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate);
-
-            // Update tile status if there may be successor tiles (i.e., this tile is full)
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            T block_aggregate;
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
-            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
-        }
-
-        __syncthreads();
-
-        // Store items
-        if (LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic domino scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_items,          ///< Total number of input items
-        GridQueue<int>          queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState      &tile_status)       ///< Global list of tile status
-    {
-#if (CUB_PTX_ARCH <= 130)
-        // Blocks are launched in increasing order, so just assign one tile per block
-
-        int     tile_idx        = (blockIdx.y * 32 * 1024) + blockIdx.x;    // Current tile index
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (block_offset + TILE_ITEMS <= num_items)
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        else if (block_offset < num_items)
-            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int     tile_idx        = temp_storage.tile_idx;
-        Offset  block_offset    = TILE_ITEMS * tile_idx;
-        Offset  num_remaining   = num_items - block_offset;
-
-        while (num_remaining >= TILE_ITEMS)
-        {
-            // Consume full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx        = temp_storage.tile_idx;
-            block_offset    = TILE_ITEMS * tile_idx;
-            num_remaining   = num_items - block_offset;
-        }
-
-        // Consume the last (and potentially partially-full) tile
-        if (num_remaining > 0)
-        {
-            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-
-#endif
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scan an sequence of consecutive tiles (independent of other thread blocks)
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input
-     */
-    template <
-        bool                FULL_TILE,
-        bool                FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset                      block_offset,               ///< Tile offset
-        RunningPrefixCallbackOp     &prefix_op,                 ///< Running prefix operator
-        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
-    {
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (FULL_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
-
-        __syncthreads();
-
-        // Block scan
-        if (FIRST_TILE)
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate);
-            prefix_op.running_total = block_aggregate;
-        }
-        else
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
-        }
-
-        __syncthreads();
-
-        // Store items
-        if (FULL_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset   block_offset,      ///< [in] Threadblock begin offset (inclusive)
-        Offset   block_end)         ///< [in] Threadblock end offset (exclusive)
-    {
-        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
-
-        if (block_offset + TILE_ITEMS <= block_end)
-        {
-            // Consume first tile of input (full)
-            ConsumeTile<true, true>(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-
-            // Consume subsequent full tiles of input
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ConsumeTile<true, false>(block_offset, prefix_op);
-                block_offset += TILE_ITEMS;
-            }
-
-            // Consume a partially-full tile
-            if (block_offset < block_end)
-            {
-                int valid_items = block_end - block_offset;
-                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
-            }
-        }
-        else
-        {
-            // Consume the first tile of input (partially-full)
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
-        }
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles, seeded with the specified prefix value
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
-        T       prefix)                             ///< [in] The prefix to apply to the scan segment
-    {
-        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
-
-        // Consume full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile<true, false>(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh b/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh
deleted file mode 100644
index 59fb5ce2f..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_range_select.cuh
+++ /dev/null
@@ -1,735 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRangeSelect
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _TWO_PHASE_SCATTER,             ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockRangeSelectPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        TWO_PHASE_SCATTER       = _TWO_PHASE_SCATTER,           ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRangeSelect implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename    BlockRangeSelectPolicy,         ///< Parameterized BlockRangeSelectPolicy tuning policy type
-    typename    InputIterator,                  ///< Random-access input iterator type for selection items
-    typename    FlagIterator,                   ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    OutputIterator,                 ///< Random-access input iterator type for selected items
-    typename    SelectOp,                       ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
-    typename    EqualityOp,                     ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
-    typename    Offset,                         ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct BlockRangeSelect
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Data type of flag iterator
-    typedef typename std::iterator_traits<FlagIterator>::value_type Flag;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<Offset> ScanTileState;
-
-    // Constants
-    enum
-    {
-        USE_SELECT_OP,
-        USE_SELECT_FLAGS,
-        USE_DISCONTINUITY,
-
-        BLOCK_THREADS       = BlockRangeSelectPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeSelectPolicy::ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER   = (BlockRangeSelectPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not to sync after loading data
-        SYNC_AFTER_LOAD     = (BlockRangeSelectPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        SELECT_METHOD       = (!Equals<SelectOp, NullType>::VALUE) ?
-                                USE_SELECT_OP :
-                                (!Equals<Flag, NullType>::VALUE) ?
-                                    USE_SELECT_FLAGS :
-                                    USE_DISCONTINUITY
-    };
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, T, Offset>,      // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                                // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Flag iterator wrapper type
-    typedef typename If<IsPointer<FlagIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRangeSelectPolicy::LOAD_MODIFIER, Flag, Offset>,   // Wrap the native input pointer with CacheModifiedInputIterator
-            FlagIterator>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedFlagIterator;
-
-    // Parameterized BlockLoad type for input items
-    typedef BlockLoad<
-            WrappedInputIterator,
-            BlockRangeSelectPolicy::BLOCK_THREADS,
-            BlockRangeSelectPolicy::ITEMS_PER_THREAD,
-            BlockRangeSelectPolicy::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockLoad type for flags
-    typedef BlockLoad<
-            WrappedFlagIterator,
-            BlockRangeSelectPolicy::BLOCK_THREADS,
-            BlockRangeSelectPolicy::ITEMS_PER_THREAD,
-            BlockRangeSelectPolicy::LOAD_ALGORITHM>
-        BlockLoadFlags;
-
-    // Parameterized BlockExchange type for input items
-    typedef BlockExchange<
-            T,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeT;
-
-    // Parameterized BlockDiscontinuity type for input items
-    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            Offset,
-            BlockRangeSelectPolicy::BLOCK_THREADS,
-            BlockRangeSelectPolicy::SCAN_ALGORITHM>
-        BlockScanAllocations;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            Offset,
-            Sum,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-        union
-        {
-            struct
-            {
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;         // Smem needed for cooperative prefix callback
-                typename BlockScanAllocations::TempStorage      scan;           // Smem needed for tile scanning
-                typename BlockDiscontinuityT::TempStorage       discontinuity;  // Smem needed for discontinuity detection
-            };
-
-            // Smem needed for input loading
-            typename BlockLoadT::TempStorage load_items;
-
-            // Smem needed for flag loading
-            typename BlockLoadFlags::TempStorage load_flags;
-
-            // Smem needed for two-phase scatter
-            typename If<TWO_PHASE_SCATTER, typename BlockExchangeT::TempStorage, NullType>::Type exchange;
-        };
-
-        Offset      tile_idx;                   // Shared tile index
-        Offset      tile_num_selected_prefix;   // Exclusive tile prefix
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;      ///< Reference to temp_storage
-    WrappedInputIterator            d_in;               ///< Input data
-    WrappedFlagIterator             d_flags;            ///< Input flags
-    OutputIterator                  d_out;              ///< Output data
-    SelectOp                        select_op;          ///< Selection operator
-    InequalityWrapper<EqualityOp>   inequality_op;      ///< Inequality operator
-    Offset                          num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockRangeSelect(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIterator               d_in,               ///< Input data
-        FlagIterator                d_flags,            ///< Input flags
-        OutputIterator              d_out,              ///< Output data
-        SelectOp                    select_op,          ///< Selection operator
-        EqualityOp                  equality_op,        ///< Equality operator
-        Offset                      num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_flags(d_flags),
-        d_out(d_out),
-        select_op(select_op),
-        inequality_op(equality_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Template unrolled selection via selection operator
-     */
-    template <bool FIRST_TILE, bool LAST_TILE, int ITERATION>
-    __device__ __forceinline__ void ApplySelectionOp(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<ITERATION>         iteration)
-    {
-        selected[ITERATION] = 0;
-        if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining))
-            selected[ITERATION] = select_op(items[ITERATION]);
-
-        ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>());
-    }
-
-    /**
-     * Template unrolled selection via selection operator
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void ApplySelectionOp(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<ITEMS_PER_THREAD>  iteration)
-    {}
-
-    /**
-     * Initialize selections (specialized for selection operator)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_OP>     select_method)
-    {
-        ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>());
-    }
-
-
-    /**
-     * Initialize selections (specialized for valid flags)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_FLAGS>  select_method)
-    {
-        Flag flags[ITEMS_PER_THREAD];
-
-        if (LAST_TILE)
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0);
-        else
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            selected[ITEM] = flags[ITEM];
-        }
-
-        if (SYNC_AFTER_LOAD)
-            __syncthreads();
-    }
-
-
-    /**
-     * Initialize selections (specialized for discontinuity detection)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_DISCONTINUITY> select_method)
-    {
-        if (FIRST_TILE)
-        {
-            // First tile always flags the first item
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op);
-        }
-        else
-        {
-            // Subsequent tiles require the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[block_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for scattering selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Scatter data items to select offsets (specialized for direct scattering and for discarding rejected items)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          block_offset,
-        T               (&items)[ITEMS_PER_THREAD],
-        Offset          selected[ITEMS_PER_THREAD],
-        Offset          scatter_offsets[ITEMS_PER_THREAD],
-        Offset          tile_num_selected_prefix,
-        Offset          tile_num_selected,
-        Offset          num_remaining,
-        Int2Type<false> keep_rejects,
-        Int2Type<false> two_phase_scatter)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (selected[ITEM])
-            {
-                // Selected items are placed front-to-back
-                d_out[scatter_offsets[ITEM]] = items[ITEM];
-            }
-        }
-    }
-
-
-    /**
-     * Scatter data items to select offsets (specialized for direct scattering and for partitioning rejected items after selected items)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          block_offset,
-        T               (&items)[ITEMS_PER_THREAD],
-        Offset          selected[ITEMS_PER_THREAD],
-        Offset          scatter_offsets[ITEMS_PER_THREAD],
-        Offset          tile_num_selected_prefix,
-        Offset          tile_num_selected,
-        Offset          num_remaining,
-        Int2Type<true>  keep_rejects,
-        Int2Type<false> two_phase_scatter)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (selected[ITEM])
-            {
-                // Selected items are placed front-to-back
-                d_out[scatter_offsets[ITEM]] = items[ITEM];
-            }
-            else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining))
-            {
-                Offset global_idx = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                Offset reject_idx = global_idx - scatter_offsets[ITEM];
-
-                // Rejected items are placed back-to-front
-                d_out[num_items - reject_idx - 1] = items[ITEM];
-            }
-        }
-    }
-
-
-    /**
-     * Scatter data items to select offsets (specialized for two-phase scattering and for discarding rejected items)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          block_offset,
-        T               (&items)[ITEMS_PER_THREAD],
-        Offset          selected[ITEMS_PER_THREAD],
-        Offset          scatter_offsets[ITEMS_PER_THREAD],
-        Offset          tile_num_selected_prefix,
-        Offset          tile_num_selected,
-        Offset          num_remaining,
-        Int2Type<false> keep_rejects,
-        Int2Type<true>  two_phase_scatter)
-    {
-        if ((tile_num_selected >> Log2<BLOCK_THREADS>::VALUE) == 0)
-        {
-            // Average number of selected items per thread is less than one, so just do a one-phase scatter
-            Scatter<LAST_TILE>(
-                block_offset,
-                items,
-                selected,
-                scatter_offsets,
-                tile_num_selected_prefix,
-                tile_num_selected,
-                num_remaining,
-                keep_rejects,
-                Int2Type<false>());
-        }
-        else
-        {
-            // Share exclusive tile prefix
-            if (threadIdx.x == 0)
-            {
-                temp_storage.tile_num_selected_prefix = tile_num_selected_prefix;
-            }
-
-            __syncthreads();
-
-            // Load exclusive tile prefix in all threads
-            tile_num_selected_prefix = temp_storage.tile_num_selected_prefix;
-
-            int local_ranks[ITEMS_PER_THREAD];
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix;
-            }
-
-            BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks, selected);
-
-            // Selected items are placed front-to-back
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + tile_num_selected_prefix, items, tile_num_selected);
-        }
-    }
-
-
-    /**
-     * Scatter data items to select offsets (specialized for two-phase scattering and for partitioning rejected items after selected items)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          block_offset,
-        T               (&items)[ITEMS_PER_THREAD],
-        Offset          selected[ITEMS_PER_THREAD],
-        Offset          scatter_offsets[ITEMS_PER_THREAD],
-        Offset          tile_num_selected_prefix,
-        Offset          tile_num_selected,
-        Offset          num_remaining,
-        Int2Type<true>  keep_rejects,
-        Int2Type<true>  two_phase_scatter)
-    {
-        // Share exclusive tile prefix
-        if (threadIdx.x == 0)
-        {
-            temp_storage.tile_num_selected_prefix = tile_num_selected_prefix;
-        }
-
-        __syncthreads();
-
-        // Load the exclusive tile prefix in all threads
-        tile_num_selected_prefix = temp_storage.tile_num_selected_prefix;
-
-        // Determine the exclusive prefix for rejects
-        Offset tile_rejected_exclusive_prefix = block_offset - tile_num_selected_prefix;
-
-        // Determine local scatter offsets
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM]   = -1;
-            Offset global_idx   = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-            Offset reject_idx   = global_idx - scatter_offsets[ITEM];
-
-            if (selected[ITEM])
-            {
-                // Selected items
-                local_ranks[ITEM] = scatter_offsets[ITEM] - tile_num_selected_prefix;
-            }
-            else if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_remaining))
-            {
-                // Rejected items
-                local_ranks[ITEM] = (reject_idx - tile_rejected_exclusive_prefix) + tile_num_selected;
-            }
-        }
-
-        // Coalesce selected and rejected items in shared memory, gathering in striped arrangements
-        if (LAST_TILE)
-            BlockExchangeT(temp_storage.exchange).ScatterToStripedGuarded(items, local_ranks);
-        else
-            BlockExchangeT(temp_storage.exchange).ScatterToStriped(items, local_ranks);
-
-        // Store in striped order
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            Offset local_idx = (ITEM * BLOCK_THREADS) + threadIdx.x;
-            Offset scatter_offset = tile_num_selected_prefix + local_idx;
-            if (local_idx >= tile_num_selected)
-                scatter_offset = num_items - (tile_rejected_exclusive_prefix + (local_idx - tile_num_selected)) - 1;
-
-            if (!LAST_TILE || (local_idx < num_remaining))
-            {
-                d_out[scatter_offset] = items[ITEM];
-            }
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic domino scan)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ Offset ConsumeTile(
-        Offset              num_items,          ///< Total number of input items
-        Offset              num_remaining,      ///< Total number of items remaining to be processed (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState  &tile_status)       ///< Global list of tile status
-    {
-        T items[ITEMS_PER_THREAD];
-        Offset selected[ITEMS_PER_THREAD];              // Selection flags
-        Offset scatter_offsets[ITEMS_PER_THREAD];       // Scatter offsets
-        Offset tile_num_selected_prefix;                // Total number of selected items prior to this tile
-        Offset tile_num_selected;                       // Total number of selected items within this tile
-        Offset num_selected;                            //
-
-        // Load items
-        if (LAST_TILE)
-            BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, d_in[num_items - 1]);     // Repeat last item
-        else
-            BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items);
-
-        if (SYNC_AFTER_LOAD)
-            __syncthreads();
-
-        if (tile_idx == 0)
-        {
-            // Initialize selected/rejected output flags for first tile
-            InitializeSelections<true, LAST_TILE>(
-                block_offset,
-                num_remaining,
-                items,
-                selected,
-                Int2Type<SELECT_METHOD>());
-
-            // Compute scatter offsets by scanning the flags
-            BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected);
-
-            // Update tile status if there may be successor tiles
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, tile_num_selected);
-
-            tile_num_selected_prefix = 0;
-            num_selected = tile_num_selected;
-        }
-        else
-        {
-            // Initialize selected/rejected output flags for non-first tile
-            InitializeSelections<false, LAST_TILE>(
-                block_offset,
-                num_remaining,
-                items,
-                selected,
-                Int2Type<SELECT_METHOD>());
-
-            // Compute scatter offsets by scanning the flags
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx);
-            BlockScanAllocations(temp_storage.scan).ExclusiveSum(selected, scatter_offsets, tile_num_selected, prefix_op);
-
-            tile_num_selected_prefix = prefix_op.exclusive_prefix;
-            num_selected = prefix_op.inclusive_prefix;
-        }
-
-        // Store selected items
-        Scatter<LAST_TILE>(
-            block_offset,
-            items,
-            selected,
-            scatter_offsets,
-            tile_num_selected_prefix,
-            tile_num_selected,
-            num_remaining,
-            Int2Type<KEEP_REJECTS>(),
-            Int2Type<TWO_PHASE_SCATTER>());
-
-        // Return total number of items selected (inclusive of this tile)
-        return num_selected;
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic domino scan
-     */
-    template <typename NumSelectedIterator>         ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        GridQueue<int>          queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState      &tile_status,       ///< Global list of tile status
-        NumSelectedIterator     d_num_selected)     ///< Output total number selected
-    {
-#if (CUB_PTX_ARCH <= 130)
-        // Blocks are launched in increasing order, so just assign one tile per block
-
-        int     tile_idx        = (blockIdx.y * 32 * 1024) + blockIdx.x;    // Current tile index
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-        else if (num_remaining > 0)
-        {
-            Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Output the total number of items selected
-            if (threadIdx.x == 0)
-            {
-                *d_num_selected = total_selected;
-            }
-        }
-
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int     tile_idx        = temp_storage.tile_idx;
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;
-        Offset  num_remaining   = num_items - block_offset;
-
-        while (num_remaining > TILE_ITEMS)
-        {
-            // Consume full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx        = temp_storage.tile_idx;
-            block_offset    = Offset(TILE_ITEMS) * tile_idx;
-            num_remaining   = num_items - block_offset;
-        }
-
-        // Consume the last (and potentially partially-full) tile
-        if (num_remaining > 0)
-        {
-            Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Output the total number of items selected
-            if (threadIdx.x == 0)
-            {
-                *d_num_selected = total_selected;
-            }
-        }
-
-#endif
-
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh b/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh
deleted file mode 100644
index ba72cc2ee..000000000
--- a/thrust/system/cuda/detail/cub/block_range/block_scan_prefix_operators.cuh
+++ /dev/null
@@ -1,566 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Callback operator types for supplying BlockScan prefixes
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../util_arch.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Prefix functor type for maintaining a running prefix while scanning a region
- ******************************************************************************/
-
-/**
- * Stateful callback operator type for supplying BlockScan prefixes.
- * Maintains a running prefix that can be applied to consecutive
- * BlockScan operations.
- */
-template <
-    typename T,                 ///< BlockScan value type
-    typename ScanOp>            ///< Wrapped scan operator type
-struct BlockScanRunningPrefixOp
-{
-    ScanOp  op;                 ///< Wrapped scan operator
-    T       running_total;      ///< Running block-wide prefix
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op)
-    :
-        op(op)
-    {}
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(
-        T starting_prefix,
-        ScanOp op)
-    :
-        op(op),
-        running_total(starting_prefix)
-    {}
-
-    /**
-     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
-     */
-    __device__ __forceinline__ T operator()(
-        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
-    {
-        T retval = running_total;
-        running_total = op(running_total, block_aggregate);
-        return retval;
-    }
-};
-
-
-/******************************************************************************
- * Bookkeeping and prefix functor types for single-pass device-wide scan with dynamic lookback
- ******************************************************************************/
-
-
-/**
- * Enumerations of tile status
- */
-enum ScanTileStatus
-{
-    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID,      // Not yet processed
-    SCAN_TILE_PARTIAL,      // Tile aggregate is available
-    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
-};
-
-
-/**
- * Tile status interface.
- */
-template <
-    typename    T,
-    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
-struct ScanTileState;
-
-
-/**
- * Tile status interface specialized for scan status and value types
- * that can be combined into one machine word that can be
- * read/written coherently in a single access.
- */
-template <typename T>
-struct ScanTileState<T, true>
-{
-    // Status word type
-    typedef typename If<(sizeof(T) == 8),
-        long long,
-        typename If<(sizeof(T) == 4),
-            int,
-            typename If<(sizeof(T) == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-
-    // Unit word type
-    typedef typename If<(sizeof(T) == 8),
-        longlong2,
-        typename If<(sizeof(T) == 4),
-            int2,
-            typename If<(sizeof(T) == 2),
-                int,
-                uchar2>::Type>::Type>::Type TxnWord;
-
-
-    // Device word type
-    struct TileDescriptor
-    {
-        StatusWord  status;
-        T           value;
-    };
-
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-
-    // Device storage
-    TileDescriptor *d_tile_status;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        // Use warp-any to determine when all threads have valid status
-        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        while ((tile_descriptor.status == SCAN_TILE_INVALID))
-        {
-            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        }
-
-        status = tile_descriptor.status;
-        value = tile_descriptor.value;
-    }
-
-};
-
-
-
-/**
- * Tile status interface specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <typename T>
-struct ScanTileState<T, false>
-{
-    // Status word type
-    typedef char StatusWord;
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Device storage
-    StatusWord  *d_tile_status;
-    T           *d_tile_partial;
-    T           *d_tile_inclusive;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL),
-        d_tile_partial(NULL),
-        d_tile_inclusive(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        cudaError_t error = cudaSuccess;
-        do
-        {
-            void*   allocations[3];
-            size_t  allocation_sizes[3];
-
-            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
-            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
-            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
-
-            // Compute allocation pointers into the single storage blob
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Alias the offsets
-            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
-            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
-            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        // Specify storage allocation requirements
-        size_t  allocation_sizes[3];
-        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
-        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
-        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
-
-        // Set the necessary size of the blob
-        void* allocations[3];
-        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        // Update tile inclusive value
-        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        // Update tile partial value
-        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-        while (status == SCAN_TILE_INVALID)
-        {
-            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-        }
-
-        T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
-        T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
-
-        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
-            partial :
-            inclusive;
-
-    }
-};
-
-
-
-/**
- * Stateful block-scan prefix functor.  Provides the the running prefix for
- * the current tile by using the call-back warp to wait on on
- * aggregates/prefixes from predecessor tiles to become available.
- */
-template <
-    typename T,
-    typename ScanOp,
-    typename ScanTileState>
-struct BlockScanLookbackPrefixOp
-{
-    // Parameterized warp reduce
-    typedef WarpReduce<T> WarpReduceT;
-
-    // Temporary storage type
-    typedef typename WarpReduceT::TempStorage _TempStorage;
-
-    // Alias wrapper allowing temporary storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-    // Type of status word
-    typedef typename ScanTileState::StatusWord StatusWord;
-
-    // Scan operator for switching the scan arguments
-    struct SwizzleScanOp
-    {
-        ScanOp scan_op;
-
-        // Constructor
-        __host__ __device__ __forceinline__
-        SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {}
-
-        // Switch the scan arguments
-        __host__ __device__ __forceinline__
-        T operator()(const T &a, const T &b)
-        {
-            return scan_op(b, a);
-        }
-    };
-
-    // Fields
-    ScanTileState               &tile_status;       ///< Interface to tile status
-    _TempStorage                &temp_storage;      ///< Reference to a warp-reduction instance
-    ScanOp                      scan_op;            ///< Binary scan operator
-    int                         tile_idx;           ///< The current tile index
-    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
-    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
-
-    // Constructor
-    __device__ __forceinline__
-    BlockScanLookbackPrefixOp(
-        ScanTileState      &tile_status,
-        TempStorage             &temp_storage,
-        ScanOp                  scan_op,
-        int                     tile_idx)
-    :
-        tile_status(tile_status),
-        temp_storage(temp_storage.Alias()),
-        scan_op(scan_op),
-        tile_idx(tile_idx) {}
-
-
-    // Block until all predecessors within the warp-wide window have non-invalid status
-    __device__ __forceinline__
-    void ProcessWindow(
-        int         predecessor_idx,        ///< Preceding tile index to inspect
-        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
-        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
-    {
-        T value;
-        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
-
-        // Perform a segmented reduction to get the prefix for the current window.
-        // Use the swizzled scan operator because we are now scanning *down* towards thread0.
-
-        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
-
-        window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(
-            value,
-            tail_flag,
-            SwizzleScanOp(scan_op));
-    }
-
-
-    // BlockScan prefix callback functor (called by the first warp)
-    __device__ __forceinline__
-    T operator()(T block_aggregate)
-    {
-        // Update our status with our tile-aggregate
-        if (threadIdx.x == 0)
-        {
-            tile_status.SetPartial(tile_idx, block_aggregate);
-        }
-
-        int         predecessor_idx = tile_idx - threadIdx.x - 1;
-        StatusWord  predecessor_status;
-        T           window_aggregate;
-
-        // Wait for the warp-wide window of predecessor tiles to become valid
-        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-
-        // The exclusive tile prefix starts out as the current window aggregate
-        exclusive_prefix = window_aggregate;
-
-        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
-        while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))
-        {
-            predecessor_idx -= CUB_PTX_WARP_THREADS;
-
-            // Update exclusive tile prefix with the window prefix
-            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
-        }
-
-        // Compute the inclusive tile prefix and update the status for this tile
-        if (threadIdx.x == 0)
-        {
-            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
-            tile_status.SetInclusive(tile_idx, inclusive_prefix);
-        }
-
-        // Return exclusive_prefix
-        return exclusive_prefix;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh b/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh
deleted file mode 100644
index ccfbd6430..000000000
--- a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_gatomic.cuh
+++ /dev/null
@@ -1,184 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * BlockRangeHistogramGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
- */
-template <
-    typename    BlockRangeHistogramPolicy,      ///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                ///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockRangeHistogramGlobalAtomic
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
-    };
-
-    // Shared memory type required by this thread block
-    typedef NullType TempStorage;
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeHistogramGlobalAtomic(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {}
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD][CHANNELS];
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                    }
-                }
-            }
-
-            __threadfence_block();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
-                    }
-                }
-            }
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            int bounds = valid_items - (threadIdx.x * CHANNELS);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
-                    {
-                        SampleT item  = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                        atomicAdd(d_out_histograms[CHANNEL] + item, 1);
-                    }
-                }
-            }
-
-        }
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {}
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh b/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh
deleted file mode 100644
index 8c6256955..000000000
--- a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_satomic.cuh
+++ /dev/null
@@ -1,245 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * BlockRangeHistogramSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
- */
-template <
-    typename    BlockRangeHistogramPolicy,		///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                	///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockRangeHistogramSharedAtomic
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockRangeHistogramPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
-    };
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];  // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeHistogramSharedAtomic(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int histo_offset = 0;
-
-            #pragma unroll
-            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-            {
-                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
-            }
-            // Finish up with guarded initialization if necessary
-            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
-            {
-                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
-            }
-        }
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD][CHANNELS];
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                    }
-                }
-            }
-
-            __threadfence_block();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
-                    }
-                }
-            }
-
-            __threadfence_block();
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            int bounds = valid_items - (threadIdx.x * CHANNELS);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
-                    {
-                        SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                        atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
-                    }
-                }
-            }
-
-        }
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {
-        // Barrier to ensure shared memory histograms are coherent
-        __syncthreads();
-
-        // Copy shared memory histograms to output
-        int channel_offset = (blockIdx.x * BINS);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int histo_offset = 0;
-
-            #pragma unroll
-            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-            {
-                HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
-
-                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
-            }
-
-            // Finish up with guarded initialization if necessary
-            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
-            {
-                HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
-
-                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
-            }
-        }
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh b/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh
deleted file mode 100644
index c28d1a74f..000000000
--- a/thrust/system/cuda/detail/cub/block_range/specializations/block_range_histo_sort.cuh
+++ /dev/null
@@ -1,364 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../block/block_radix_sort.cuh"
-#include "../../block/block_discontinuity.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * BlockRangeHistogramSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
- */
-template <
-    typename    BlockRangeHistogramPolicy,          ///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                ///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockRangeHistogramSort
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS               = BlockRangeHistogramPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD            = BlockRangeHistogramPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS                  = TILE_CHANNEL_ITEMS * CHANNELS,
-
-        STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
-    };
-
-    // Parameterize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
-
-    // Parameterize BlockDiscontinuity type for our thread block
-    typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
-
-    /// Shared memory type required by this thread block
-    union _TempStorage
-    {
-        // Storage for sorting bin values
-        typename BlockRadixSortT::TempStorage sort;
-
-        struct
-        {
-            // Storage for detecting discontinuities in the tile of sorted bin values
-            typename BlockDiscontinuityT::TempStorage flag;
-
-            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
-            int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
-            int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Discontinuity functor
-    struct DiscontinuityOp
-    {
-        // Reference to temp_storage
-        _TempStorage &temp_storage;
-
-        // Constructor
-        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
-            temp_storage(temp_storage)
-        {}
-
-        // Discontinuity predicate
-        __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
-        {
-            if (a != b)
-            {
-                // Note the begin/end offsets in shared storage
-                temp_storage.run_begin[b] = b_index;
-                temp_storage.run_end[a] = b_index;
-
-                return true;
-            }
-            else
-            {
-                return false;
-            }
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Histogram counters striped across threads
-    HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRangeHistogramSort(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {
-        // Initialize histogram counters striped across threads
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            #pragma unroll
-            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-            {
-                thread_counters[CHANNEL][COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Composite a tile of input items
-     */
-    __device__ __forceinline__ void Composite(
-        SampleT   (&items)[ITEMS_PER_THREAD],                     ///< Tile of samples
-        HistoCounter    thread_counters[STRIPED_COUNTERS_PER_THREAD])   ///< Histogram counters striped across threads
-    {
-        // Sort bytes in blocked arrangement
-        BlockRadixSortT(temp_storage.sort).Sort(items);
-
-        __syncthreads();
-
-        // Initialize the shared memory's run_begin and run_end for each bin
-        #pragma unroll
-        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-        {
-            temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
-            temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
-        }
-
-        __syncthreads();
-
-        // Note the begin/end run offsets of bin runs in the sorted tile
-        int flags[ITEMS_PER_THREAD];                // unused
-        DiscontinuityOp flag_op(temp_storage);
-        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
-
-        // Update begin for first item
-        if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
-
-        __syncthreads();
-
-        // Composite into histogram
-        // Initialize the shared memory's run_begin and run_end for each bin
-        #pragma unroll
-        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-        {
-            int          bin            = (COUNTER * BLOCK_THREADS) + threadIdx.x;
-            HistoCounter run_length     = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
-
-            thread_counters[COUNTER] += run_length;
-        }
-    }
-
-
-    /**
-     * Process one channel within a tile.
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTileChannel(
-        int     channel,
-        Offset   block_offset,
-        int     valid_items)
-    {
-        // Load items in striped fashion
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD];
-
-            // Unguarded loads
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
-            }
-
-            // Composite our histogram data
-            Composite(items, thread_counters[channel]);
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD];
-
-            // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later
-            int bounds = (valid_items - (threadIdx.x * CHANNELS));
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
-                    d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
-                    0;
-            }
-
-            // Composite our histogram data
-            Composite(items, thread_counters[channel]);
-
-            __syncthreads();
-
-            // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
-            if (threadIdx.x == 0)
-            {
-                int extra = (TILE_ITEMS - valid_items) / CHANNELS;
-                thread_counters[channel][0] -= extra;
-            }
-        }
-    }
-
-
-    /**
-     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Inductive step.
-     */
-    template <bool FULL_TILE, int CHANNEL, int END>
-    struct IterateChannels
-    {
-        /**
-         * Process one channel within a tile.
-         */
-        static __device__ __forceinline__ void ConsumeTileChannel(
-            BlockRangeHistogramSort *cta,
-            Offset               block_offset,
-            int                 valid_items)
-        {
-            __syncthreads();
-
-            cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
-
-            IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
-        }
-    };
-
-
-    /**
-     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Base step.
-     */
-    template <bool FULL_TILE, int END>
-    struct IterateChannels<FULL_TILE, END, END>
-    {
-        static __device__ __forceinline__ void ConsumeTileChannel(BlockRangeHistogramSort *cta, Offset block_offset, int valid_items) {}
-    };
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        // First channel
-        ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
-
-        // Iterate through remaining channels
-        IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {
-        // Copy counters striped across threads into the histogram output
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_offset  = (blockIdx.x * BINS);
-
-            #pragma unroll
-            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-            {
-                int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
-
-                if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
-                {
-                    d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
-                }
-            }
-        }
-    }
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh
deleted file mode 100644
index 45483150e..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_histogram_sweep.cuh
+++ /dev/null
@@ -1,319 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockHistogramSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "specializations/block_histogram_gatomic_sweep.cuh"
-#include "specializations/block_histogram_satomic_sweep.cuh"
-#include "specializations/block_histogram_sort_sweep.cuh"
-#include "../util_type.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Algorithmic variants
- ******************************************************************************/
-
-
-/**
- * \brief DeviceHistogramAlgorithm enumerates alternative algorithms for BlockHistogramSweep.
- */
-enum DeviceHistogramAlgorithm
-{
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using block-wide sorting (see BlockHistogramAlgorithm::BLOCK_HISTO_SORT).
-     * -# A single thread block in the second kernel reduces them into the output histogram(s).
-     *
-     * \par Performance Considerations
-     * Delivers consistent throughput regardless of sample bin distribution.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SORT,
-
-
-    /**
-     * \par Overview
-     * A two-kernel approach in which:
-     * -# Thread blocks in the first kernel aggregate their own privatized
-     *    histograms using shared-memory \p atomicAdd().
-     * -# A single thread block in the second kernel reduces them into the
-     *    output histogram(s).
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * However, because histograms are privatized in shared memory, a large
-     * number of bins (e.g., thousands) may adversely affect occupancy and
-     * performance (or even the ability to launch).
-     */
-    DEVICE_HISTO_SHARED_ATOMIC,
-
-
-    /**
-     * \par Overview
-     * A single-kernel approach in which thread blocks update the output histogram(s) directly
-     * using global-memory \p atomicAdd().
-     *
-     * \par Performance Considerations
-     * Performance is strongly tied to the hardware implementation of atomic
-     * addition, and may be significantly degraded for non uniformly-random
-     * input distributions where many concurrent updates are likely to be
-     * made to the same bin counter.
-     *
-     * Performance is not significantly impacted when computing histograms having large
-     * numbers of bins (e.g., thousands).
-     */
-    DEVICE_HISTO_GLOBAL_ATOMIC,
-
-};
-
-
-/******************************************************************************
- * Tuning policy
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockHistogramSweep
- */
-template <
-    int                             _BLOCK_THREADS,         ///< Threads per thread block
-    int                             _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    DeviceHistogramAlgorithm        _HISTO_ALGORITHM,       ///< Cooperative histogram algorithm to use
-    GridMappingStrategy             _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
-struct BlockHistogramSweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-    };
-
-    static const DeviceHistogramAlgorithm   HISTO_ALGORITHM     = _HISTO_ALGORITHM;     ///< Cooperative histogram algorithm to use
-    static const GridMappingStrategy        GRID_MAPPING        = _GRID_MAPPING;        ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockHistogramSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles.
- */
-template <
-    typename    BlockHistogramSweepPolicy,      ///< Parameterized BlockHistogramSweepPolicy tuning policy type
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                  ///< Random-access input iterator type for reading samples.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                         ///< Signed integer type for global offsets
-struct BlockHistogramSweep
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Histogram grid algorithm
-    static const DeviceHistogramAlgorithm HISTO_ALGORITHM = BlockHistogramSweepPolicy::HISTO_ALGORITHM;
-
-    // Alternative internal implementation types
-    typedef BlockHistogramSweepSort<            BlockHistogramSweepPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockHistogramSweepSortT;
-    typedef BlockHistogramSweepSharedAtomic<    BlockHistogramSweepPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockHistogramSweepSharedAtomicT;
-    typedef BlockHistogramSweepGlobalAtomic<    BlockHistogramSweepPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>   BlockHistogramSweepGlobalAtomicT;
-
-    // Internal block sweep histogram type
-    typedef typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SORT),
-        BlockHistogramSweepSortT,
-        typename If<(HISTO_ALGORITHM == DEVICE_HISTO_SHARED_ATOMIC),
-            BlockHistogramSweepSharedAtomicT,
-            BlockHistogramSweepGlobalAtomicT>::Type>::Type InternalBlockDelegate;
-
-    enum
-    {
-        TILE_ITEMS = InternalBlockDelegate::TILE_ITEMS,
-    };
-
-
-    // Temporary storage type
-    typedef typename InternalBlockDelegate::TempStorage TempStorage;
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    // Internal block delegate
-    InternalBlockDelegate internal_delegate;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockHistogramSweep(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        internal_delegate(temp_storage, d_in, d_out_histograms)
-    {}
-
-
-    /**
-     * \brief Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset   block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset   block_end)                          ///< [in] Threadblock end offset (exclusive)
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Reduce a consecutive segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                              num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
-    {
-        even_share.BlockInit();
-        ConsumeRange(even_share.block_offset, even_share.block_end);
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        GridQueue<Offset>   queue)              ///< Queue descriptor for assigning tiles of work to thread blocks
-    {
-        // Shared block offset
-        __shared__ Offset shared_block_offset;
-
-        // We give each thread block at least one tile of input.
-        Offset block_offset      = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base   = gridDim.x * TILE_ITEMS;
-
-        // Process full tiles of input
-        while (block_offset + TILE_ITEMS <= num_items)
-        {
-            internal_delegate.ConsumeTile<true>(block_offset);
-
-            // Dequeue up to TILE_ITEMS
-            if (threadIdx.x == 0)
-                shared_block_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-            __syncthreads();
-
-            block_offset = shared_block_offset;
-
-            __syncthreads();
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            internal_delegate.ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Aggregate output
-        internal_delegate.AggregateOutput();
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                          num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue);
-    }
-
-
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh
deleted file mode 100644
index aae4ff1b0..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_downsweep.cuh
+++ /dev/null
@@ -1,744 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * BlockRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-
-
-#pragma once
-
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_radix_rank.cuh"
-#include "../block/block_exchange.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Types of scattering strategies
- */
-enum RadixSortScatterAlgorithm
-{
-    RADIX_SORT_SCATTER_DIRECT,      ///< Scatter directly from registers to global bins
-    RADIX_SORT_SCATTER_TWO_PHASE,   ///< First scatter from registers into shared memory bins, then into global bins
-};
-
-
-/**
- * Parameterizable tuning policy type for BlockRadixSortDownsweep
- */
-template <
-    int                         _BLOCK_THREADS,             ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,          ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,            ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,             ///< Cache load modifier for reading keys (and values)
-    bool                        _EXCHANGE_TIME_SLICING,     ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-    bool                        _MEMOIZE_OUTER_SCAN,        ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure.  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    BlockScanAlgorithm          _INNER_SCAN_ALGORITHM,      ///< The BlockScan algorithm algorithm to use
-    RadixSortScatterAlgorithm   _SCATTER_ALGORITHM,         ///< The scattering strategy to use
-    cudaSharedMemConfig         _SMEM_CONFIG,               ///< Shared memory bank mode
-    int                         _RADIX_BITS>                ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortDownsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,           ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,        ///< Items per thread (per tile of input)
-        EXCHANGE_TIME_SLICING   = _EXCHANGE_TIME_SLICING,   ///< Whether or not to time-slice key/value exchanges through shared memory to lower shared memory pressure
-        RADIX_BITS              = _RADIX_BITS,              ///< The number of radix bits, i.e., log2(bins)
-        MEMOIZE_OUTER_SCAN      = _MEMOIZE_OUTER_SCAN,      ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure.  See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details.
-    };
-
-    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier          LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading keys (and values)
-    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = _INNER_SCAN_ALGORITHM;    ///< The BlockScan algorithm algorithm to use
-    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = _SCATTER_ALGORITHM;       ///< The scattering strategy to use
-    static const cudaSharedMemConfig        SMEM_CONFIG             = _SMEM_CONFIG;             ///< Shared memory bank mode
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep across a range of tiles.
- */
-template <
-    typename BlockRadixSortDownsweepPolicy,        ///< Parameterized BlockRadixSortDownsweepPolicy tuning policy type
-    bool     DESCENDING,                                   ///< Whether or not the sorted-order is high-to-low
-    typename Key,                                       ///< Key type
-    typename Value,                                     ///< Value type
-    typename Offset>                                    ///< Signed integer type for global offsets
-struct BlockRadixSortDownsweep
-{
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    // Appropriate unsigned-bits representation of Key
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-    static const UnsignedBits MIN_KEY = Traits<Key>::MIN_KEY;
-    static const UnsignedBits MAX_KEY = Traits<Key>::MAX_KEY;
-
-    static const BlockLoadAlgorithm         LOAD_ALGORITHM          = BlockRadixSortDownsweepPolicy::LOAD_ALGORITHM;
-    static const CacheLoadModifier          LOAD_MODIFIER           = BlockRadixSortDownsweepPolicy::LOAD_MODIFIER;
-    static const BlockScanAlgorithm         INNER_SCAN_ALGORITHM    = BlockRadixSortDownsweepPolicy::INNER_SCAN_ALGORITHM;
-    static const RadixSortScatterAlgorithm  SCATTER_ALGORITHM       = BlockRadixSortDownsweepPolicy::SCATTER_ALGORITHM;
-    static const cudaSharedMemConfig        SMEM_CONFIG             = BlockRadixSortDownsweepPolicy::SMEM_CONFIG;
-
-    enum
-    {
-        BLOCK_THREADS           = BlockRadixSortDownsweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = BlockRadixSortDownsweepPolicy::ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING   = BlockRadixSortDownsweepPolicy::EXCHANGE_TIME_SLICING,
-        RADIX_BITS              = BlockRadixSortDownsweepPolicy::RADIX_BITS,
-        MEMOIZE_OUTER_SCAN      = BlockRadixSortDownsweepPolicy::MEMOIZE_OUTER_SCAN,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-        KEYS_ONLY               = Equals<Value, NullType>::VALUE,
-
-        WARP_THREADS            = CUB_PTX_LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        BYTES_PER_SIZET         = sizeof(Offset),
-        LOG_BYTES_PER_SIZET     = Log2<BYTES_PER_SIZET>::VALUE,
-
-        LOG_SMEM_BANKS          = CUB_PTX_LOG_SMEM_BANKS,
-        SMEM_BANKS              = 1 << LOG_SMEM_BANKS,
-
-        DIGITS_PER_SCATTER_PASS = BLOCK_THREADS / SMEM_BANKS,
-        SCATTER_PASSES          = RADIX_DIGITS / DIGITS_PER_SCATTER_PASS,
-
-        LOG_STORE_TXN_THREADS   = LOG_SMEM_BANKS,
-        STORE_TXN_THREADS       = 1 << LOG_STORE_TXN_THREADS,
-    };
-
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset>  KeysItr;
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, Value, Offset>         ValuesItr;
-
-    // BlockRadixRank type
-    typedef BlockRadixRank<
-        BLOCK_THREADS,
-        RADIX_BITS,
-        DESCENDING,
-        MEMOIZE_OUTER_SCAN,
-        INNER_SCAN_ALGORITHM,
-        SMEM_CONFIG> BlockRadixRank;
-
-    // BlockLoad type (keys)
-    typedef BlockLoad<
-        KeysItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadKeys;
-
-    // BlockLoad type (values)
-    typedef BlockLoad<
-        ValuesItr,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        LOAD_ALGORITHM,
-        EXCHANGE_TIME_SLICING> BlockLoadValues;
-
-    // BlockExchange type (keys)
-    typedef BlockExchange<
-        UnsignedBits,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeKeys;
-
-    // BlockExchange type (values)
-    typedef BlockExchange<
-        Value,
-        BLOCK_THREADS,
-        ITEMS_PER_THREAD,
-        EXCHANGE_TIME_SLICING> BlockExchangeValues;
-
-
-    /**
-     * Shared memory storage layout
-     */
-    struct _TempStorage
-    {
-        Offset  relative_bin_offsets[RADIX_DIGITS + 1];
-        bool    short_circuit;
-
-        union
-        {
-            typename BlockRadixRank::TempStorage        ranking;
-            typename BlockLoadKeys::TempStorage         load_keys;
-            typename BlockLoadValues::TempStorage       load_values;
-            typename BlockExchangeKeys::TempStorage     exchange_keys;
-            typename BlockExchangeValues::TempStorage   exchange_values;
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-    ValuesItr       d_values_in;
-    UnsignedBits    *d_keys_out;
-    Value           *d_values_out;
-
-    // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads)
-    Offset          bin_offset;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-    // Whether to short-ciruit
-    bool            short_circuit;
-
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decodes given keys to lookup digit offsets in shared memory
-     */
-    __device__ __forceinline__ void DecodeRelativeBinOffsets(
-        UnsignedBits    (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset          (&relative_bin_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            UnsignedBits digit = BFE(twiddled_keys[KEY], current_bit, num_bits);
-
-            // Lookup base digit offset from shared memory
-            relative_bin_offsets[KEY] = temp_storage.relative_bin_offsets[digit];
-        }
-    }
-
-
-    /**
-     * Scatter ranked items to global memory
-     */
-    template <bool FULL_TILE, typename T>
-    __device__ __forceinline__ void ScatterItems(
-        T       (&items)[ITEMS_PER_THREAD],
-        int     (&local_ranks)[ITEMS_PER_THREAD],
-        Offset  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        T       *d_out,
-        Offset  valid_items)
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Scatter if not out-of-bounds
-            if (FULL_TILE || (local_ranks[ITEM] < valid_items))
-            {
-                d_out[relative_bin_offsets[ITEM] + local_ranks[ITEM]] = items[ITEM];
-            }
-        }
-    }
-
-
-    /**
-     * Scatter ranked keys directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
-    {
-        // Compute scatter offsets
-        DecodeRelativeBinOffsets(twiddled_keys, relative_bin_offsets);
-
-        // Untwiddle keys before outputting
-        UnsignedBits keys[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            keys[KEY] = Traits<Key>::TwiddleOut(twiddled_keys[KEY]);
-        }
-
-        // Scatter to global
-        ScatterItems<FULL_TILE>(keys, ranks, relative_bin_offsets, d_keys_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked keys through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterKeys(
-        UnsignedBits                            (&twiddled_keys)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
-    {
-        // Exchange keys through shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(twiddled_keys, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterKeys<FULL_TILE>(
-            twiddled_keys,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Scatter ranked values directly to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value                                   (&values)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_DIRECT>     scatter_algorithm)
-    {
-        // Scatter to global
-        ScatterItems<FULL_TILE>(values, ranks, relative_bin_offsets, d_values_out, valid_items);
-    }
-
-
-    /**
-     * Scatter ranked values through shared memory, then to global memory
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ScatterValues(
-        Value                                   (&values)[ITEMS_PER_THREAD],
-        Offset                                  (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int                                     (&ranks)[ITEMS_PER_THREAD],
-        Offset                                  valid_items,
-        Int2Type<RADIX_SORT_SCATTER_TWO_PHASE>  scatter_algorithm)
-    {
-        __syncthreads();
-
-        // Exchange keys through shared memory
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks);
-
-        // Compute striped local ranks
-        int local_ranks[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = threadIdx.x + (ITEM * BLOCK_THREADS);
-        }
-
-        // Scatter directly
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            local_ranks,
-            valid_items,
-            Int2Type<RADIX_SORT_SCATTER_DIRECT>());
-    }
-
-
-    /**
-     * Load a tile of items (specialized for full tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT      &block_loader, 
-        T               (&items)[ITEMS_PER_THREAD],
-        InputIterator   d_in,
-        Offset          valid_items,
-        Int2Type<true>  is_full_tile)
-    {
-        block_loader.Load(d_in, items);
-    }
-
-
-    /**
-     * Load a tile of items (specialized for partial tile)
-     */
-    template <typename BlockLoadT, typename T, typename InputIterator>
-    __device__ __forceinline__ void LoadItems(
-        BlockLoadT      &block_loader, 
-        T               (&items)[ITEMS_PER_THREAD],
-        InputIterator   d_in,
-        Offset          valid_items,
-        Int2Type<false> is_full_tile)
-    {
-        block_loader.Load(d_in, items, valid_items);
-    }
-
-
-    /**
-     * Truck along associated values
-     */
-    template <bool FULL_TILE, typename _Value>
-    __device__ __forceinline__ void GatherScatterValues(
-        _Value      (&values)[ITEMS_PER_THREAD],
-        Offset      (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        Offset      block_offset,
-        Offset      valid_items)
-    {
-        __syncthreads();
-
-        BlockLoadValues loader(temp_storage.load_values);
-        LoadItems(
-            loader,
-            values,
-            d_values_in + block_offset,
-            valid_items,
-            Int2Type<FULL_TILE>());
-
-        ScatterValues<FULL_TILE>(
-            values,
-            relative_bin_offsets,
-            ranks,
-            valid_items,
-            Int2Type<SCATTER_ALGORITHM>());
-    }
-
-
-    /**
-     * Truck along associated values (specialized for key-only sorting)
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void GatherScatterValues(
-        NullType    (&values)[ITEMS_PER_THREAD],
-        Offset      (&relative_bin_offsets)[ITEMS_PER_THREAD],
-        int         (&ranks)[ITEMS_PER_THREAD],
-        Offset      block_offset,
-        Offset      valid_items)
-    {}
-
-
-    /**
-     * Process tile
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ProcessTile(
-        Offset block_offset,
-        const Offset &valid_items = TILE_ITEMS)
-    {
-        // Per-thread tile data
-        UnsignedBits    keys[ITEMS_PER_THREAD];                     // Keys
-        UnsignedBits    twiddled_keys[ITEMS_PER_THREAD];            // Twiddled keys
-        int             ranks[ITEMS_PER_THREAD];                    // For each key, the local rank within the CTA
-        Offset          relative_bin_offsets[ITEMS_PER_THREAD];     // For each key, the global scatter base offset of the corresponding digit
-
-        // Assign max-key to all keys
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            keys[ITEM] = (DESCENDING) ? MIN_KEY : MAX_KEY;
-        }
-
-        // Load tile of keys
-        BlockLoadKeys loader(temp_storage.load_keys);
-        LoadItems(
-            loader,
-            keys,
-            d_keys_in + block_offset,
-            valid_items, 
-            Int2Type<FULL_TILE>());
-
-        __syncthreads();
-
-        // Twiddle key bits if necessary
-        #pragma unroll
-        for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++)
-        {
-            twiddled_keys[KEY] = Traits<Key>::TwiddleIn(keys[KEY]);
-        }
-
-        // Rank the twiddled keys
-        int inclusive_digit_prefix;
-        BlockRadixRank(temp_storage.ranking).RankKeys(
-            twiddled_keys,
-            ranks,
-            current_bit,
-            num_bits,
-            inclusive_digit_prefix);
-
-        // Update global scatter base offsets for each digit
-        if ((BLOCK_THREADS == RADIX_DIGITS) || (threadIdx.x < RADIX_DIGITS))
-        {
-            int exclusive_digit_prefix;
-
-            // Get exclusive digit prefix from inclusive prefix
-            if (DESCENDING)
-            {
-                // Get the prefix from the next thread (higher bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleDown(inclusive_digit_prefix, 1);
-                if (threadIdx.x == RADIX_DIGITS - 1)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x + 1] = 0;
-                exchange[threadIdx.x] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x + 1];
-#endif
-            }
-            else
-            {
-                // Get the prefix from the previous thread (lower bins come first)
-#if CUB_PTX_ARCH >= 300
-                exclusive_digit_prefix = ShuffleUp(inclusive_digit_prefix, 1);
-                if (threadIdx.x == 0)
-                    exclusive_digit_prefix = 0;
-#else
-                volatile int* exchange = reinterpret_cast<int *>(temp_storage.relative_bin_offsets);
-                exchange[threadIdx.x] = 0;
-                exchange[threadIdx.x + 1] = inclusive_digit_prefix;
-                exclusive_digit_prefix = exchange[threadIdx.x];
-#endif
-            }
-
-            bin_offset -= exclusive_digit_prefix;
-            temp_storage.relative_bin_offsets[threadIdx.x] = bin_offset;
-            bin_offset += inclusive_digit_prefix;
-        }
-
-        __syncthreads();
-
-        // Scatter keys
-        ScatterKeys<FULL_TILE>(twiddled_keys, relative_bin_offsets, ranks, valid_items, Int2Type<SCATTER_ALGORITHM>());
-
-        // Gather/scatter values
-        Value values[ITEMS_PER_THREAD];
-        GatherScatterValues<FULL_TILE>(values, relative_bin_offsets, ranks, block_offset, valid_items);
-    }
-
-
-    /**
-     * Copy tiles within the range of input
-     */
-    template <
-        typename InputIterator,
-        typename T>
-    __device__ __forceinline__ void Copy(
-        InputIterator   d_in,
-        T               *d_out,
-        Offset          block_offset,
-        Offset          block_end)
-    {
-        // Simply copy the input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items);
-
-            block_offset += TILE_ITEMS;
-        }
-
-        // Clean up last partial tile with guarded-I/O
-        if (block_offset < block_end)
-        {
-            Offset valid_items = block_end - block_offset;
-
-            T items[ITEMS_PER_THREAD];
-
-            LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_in + block_offset, items, valid_items);
-            __syncthreads();
-            StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_out + block_offset, items, valid_items);
-        }
-    }
-
-
-    /**
-     * Copy tiles within the range of input (specialized for NullType)
-     */
-    template <typename InputIterator>
-    __device__ __forceinline__ void Copy(
-        InputIterator   d_in,
-        NullType        *d_out,
-        Offset          block_offset,
-        Offset          block_end)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset       bin_offset,
-        Key         *d_keys_in,
-        Key         *d_keys_out,
-        Value       *d_values_in,
-        Value       *d_values_out,
-        int         current_bit,
-        int         num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        bin_offset(bin_offset),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits),
-        short_circuit(false)
-    {}
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRadixSortDownsweep(
-        TempStorage &temp_storage,
-        Offset      num_items,
-        Offset      *d_spine,
-        Key         *d_keys_in,
-        Key         *d_keys_out,
-        Value       *d_values_in,
-        Value       *d_values_out,
-        int         current_bit,
-        int         num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        d_keys_out(reinterpret_cast<UnsignedBits*>(d_keys_out)),
-        d_values_in(d_values_in),
-        d_values_out(d_values_out),
-        current_bit(current_bit),
-        num_bits(num_bits)
-    {
-        // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit)
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            int bin_idx = (DESCENDING) ?
-                RADIX_DIGITS - threadIdx.x - 1 :
-                threadIdx.x;
-
-            // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size
-            Offset first_block_bin_offset = d_spine[gridDim.x * bin_idx];
-            int predicate = ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items));
-            this->temp_storage.short_circuit = WarpAll(predicate);
-
-            // Load my block's bin offset for my bin
-            bin_offset = d_spine[(gridDim.x * bin_idx) + blockIdx.x];
-        }
-
-        __syncthreads();
-
-        short_circuit = this->temp_storage.short_circuit;
-    }
-
-
-    /**
-     * Distribute keys from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        Offset          block_offset,
-        const Offset    &block_end)
-    {
-        if (short_circuit)
-        {
-            // Copy keys
-            Copy(d_keys_in, d_keys_out, block_offset, block_end);
-
-            // Copy values
-            Copy(d_values_in, d_values_out, block_offset, block_end);
-        }
-        else
-        {
-            // Process full tiles of tile_items
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ProcessTile<true>(block_offset);
-                block_offset += TILE_ITEMS;
-
-                __syncthreads();
-            }
-
-            // Clean up last partial tile with guarded-I/O
-            if (block_offset < block_end)
-            {
-                ProcessTile<false>(block_offset, block_end - block_offset);
-            }
-        }
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh
deleted file mode 100644
index 284b84b51..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_radix_sort_upsweep.cuh
+++ /dev/null
@@ -1,449 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * BlockRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
- */
-
-#pragma once
-
-#include "../thread/thread_reduce.cuh"
-#include "../thread/thread_load.cuh"
-#include "../block/block_load.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRadixSortUpsweep
- */
-template <
-    int                 _BLOCK_THREADS,     ///< Threads per thread block
-    int                 _ITEMS_PER_THREAD,  ///< Items per thread (per tile of input)
-    CacheLoadModifier   _LOAD_MODIFIER,     ///< Cache load modifier for reading keys
-    int                 _RADIX_BITS>        ///< The number of radix bits, i.e., log2(bins)
-struct BlockRadixSortUpsweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        RADIX_BITS          = _RADIX_BITS,          ///< The number of radix bits, i.e., log2(bins)
-    };
-
-    static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;      ///< Cache load modifier for reading keys
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep across a range of tiles.
- */
-template <
-    typename BlockRadixSortUpsweepPolicy,   ///< Parameterized BlockRadixSortUpsweepPolicy tuning policy type
-    typename Key,                           ///< Key type
-    typename Offset>                        ///< Signed integer type for global offsets
-struct BlockRadixSortUpsweep
-{
-
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    typedef typename Traits<Key>::UnsignedBits UnsignedBits;
-
-    // Integer type for digit counters (to be packed into words of PackedCounters)
-    typedef unsigned char DigitCounter;
-
-    // Integer type for packing DigitCounters into columns of shared memory banks
-    typedef unsigned int PackedCounter;
-
-    static const CacheLoadModifier LOAD_MODIFIER = BlockRadixSortUpsweepPolicy::LOAD_MODIFIER;
-
-    enum
-    {
-        RADIX_BITS              = BlockRadixSortUpsweepPolicy::RADIX_BITS,
-        BLOCK_THREADS           = BlockRadixSortUpsweepPolicy::BLOCK_THREADS,
-        KEYS_PER_THREAD         = BlockRadixSortUpsweepPolicy::ITEMS_PER_THREAD,
-
-        RADIX_DIGITS            = 1 << RADIX_BITS,
-
-        LOG_WARP_THREADS        = CUB_PTX_LOG_WARP_THREADS,
-        WARP_THREADS            = 1 << LOG_WARP_THREADS,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        TILE_ITEMS              = BLOCK_THREADS * KEYS_PER_THREAD,
-
-        BYTES_PER_COUNTER       = sizeof(DigitCounter),
-        LOG_BYTES_PER_COUNTER   = Log2<BYTES_PER_COUNTER>::VALUE,
-
-        PACKING_RATIO           = sizeof(PackedCounter) / sizeof(DigitCounter),
-        LOG_PACKING_RATIO       = Log2<PACKING_RATIO>::VALUE,
-
-        LOG_COUNTER_LANES       = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO),
-        COUNTER_LANES           = 1 << LOG_COUNTER_LANES,
-
-        // To prevent counter overflow, we must periodically unpack and aggregate the
-        // digit counters back into registers.  Each counter lane is assigned to a
-        // warp for aggregation.
-
-        LANES_PER_WARP          = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS),
-
-        // Unroll tiles in batches without risk of counter overflow
-        UNROLL_COUNT            = CUB_MIN(64, 255 / KEYS_PER_THREAD),
-        UNROLLED_ELEMENTS       = UNROLL_COUNT * TILE_ITEMS,
-    };
-
-
-    // Input iterator wrapper types
-    typedef CacheModifiedInputIterator<LOAD_MODIFIER, UnsignedBits, Offset>  KeysItr;
-
-    /**
-     * Shared memory storage layout
-     */
-    struct _TempStorage
-    {
-        union
-        {
-            DigitCounter    digit_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO];
-            PackedCounter   packed_counters[COUNTER_LANES][BLOCK_THREADS];
-            Offset          digit_partials[RADIX_DIGITS][WARP_THREADS + 1];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Thread fields (aggregate state bundle)
-    //---------------------------------------------------------------------
-
-    // Shared storage for this CTA
-    _TempStorage    &temp_storage;
-
-    // Thread-local counters for periodically aggregating composite-counter lanes
-    Offset          local_counts[LANES_PER_WARP][PACKING_RATIO];
-
-    // Input and output device pointers
-    KeysItr         d_keys_in;
-
-    // The least-significant bit position of the current digit to extract
-    int             current_bit;
-
-    // Number of bits in current digit
-    int             num_bits;
-
-
-
-    //---------------------------------------------------------------------
-    // Helper structure for templated iteration
-    //---------------------------------------------------------------------
-
-    // Iterate
-    template <int COUNT, int MAX>
-    struct Iterate
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(
-            BlockRadixSortUpsweep     &cta,
-            UnsignedBits                    keys[KEYS_PER_THREAD])
-        {
-            cta.Bucket(keys[COUNT]);
-
-            // Next
-            Iterate<COUNT + 1, MAX>::BucketKeys(cta, keys);
-        }
-    };
-
-    // Terminate
-    template <int MAX>
-    struct Iterate<MAX, MAX>
-    {
-        // BucketKeys
-        static __device__ __forceinline__ void BucketKeys(BlockRadixSortUpsweep &cta, UnsignedBits keys[KEYS_PER_THREAD]) {}
-    };
-
-
-    //---------------------------------------------------------------------
-    // Utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Decode a key and increment corresponding smem digit counter
-     */
-    __device__ __forceinline__ void Bucket(UnsignedBits key)
-    {
-        // Perform transform op
-        UnsignedBits converted_key = Traits<Key>::TwiddleIn(key);
-
-        // Extract current digit bits
-        UnsignedBits digit = BFE(converted_key, current_bit, num_bits);
-
-        // Get sub-counter offset
-        UnsignedBits sub_counter = digit & (PACKING_RATIO - 1);
-
-        // Get row offset
-        UnsignedBits row_offset = digit >> LOG_PACKING_RATIO;
-
-        // Increment counter
-        temp_storage.digit_counters[row_offset][threadIdx.x][sub_counter]++;
-    }
-
-
-    /**
-     * Reset composite counters
-     */
-    __device__ __forceinline__ void ResetDigitCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < COUNTER_LANES; LANE++)
-        {
-            temp_storage.packed_counters[LANE][threadIdx.x] = 0;
-        }
-    }
-
-
-    /**
-     * Reset the unpacked counters in each thread
-     */
-    __device__ __forceinline__ void ResetUnpackedCounters()
-    {
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            #pragma unroll
-            for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-            {
-                local_counts[LANE][UNPACKED_COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Extracts and aggregates the digit counters for each counter lane
-     * owned by this warp
-     */
-    __device__ __forceinline__ void UnpackDigitCounts()
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
-
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            const int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                #pragma unroll
-                for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS)
-                {
-                    #pragma unroll
-                    for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                    {
-                        Offset counter = temp_storage.digit_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER];
-                        local_counts[LANE][UNPACKED_COUNTER] += counter;
-                    }
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Places unpacked counters into smem for final digit reduction
-     */
-    __device__ __forceinline__ void ReduceUnpackedCounts(Offset &bin_count)
-    {
-        unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS;
-        unsigned int warp_tid = threadIdx.x & (WARP_THREADS - 1);
-
-        // Place unpacked digit counters in shared memory
-        #pragma unroll
-        for (int LANE = 0; LANE < LANES_PER_WARP; LANE++)
-        {
-            int counter_lane = (LANE * WARPS) + warp_id;
-            if (counter_lane < COUNTER_LANES)
-            {
-                int digit_row = counter_lane << LOG_PACKING_RATIO;
-
-                #pragma unroll
-                for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++)
-                {
-                    temp_storage.digit_partials[digit_row + UNPACKED_COUNTER][warp_tid] =
-                        local_counts[LANE][UNPACKED_COUNTER];
-                }
-            }
-        }
-
-        __syncthreads();
-
-        // Rake-reduce bin_count reductions
-        if (threadIdx.x < RADIX_DIGITS)
-        {
-            bin_count = ThreadReduce<WARP_THREADS>(
-                temp_storage.digit_partials[threadIdx.x],
-                Sum());
-        }
-    }
-
-
-    /**
-     * Processes a single, full tile
-     */
-    __device__ __forceinline__ void ProcessFullTile(Offset block_offset)
-    {
-        // Tile of keys
-        UnsignedBits keys[KEYS_PER_THREAD];
-
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_keys_in + block_offset, keys);
-
-        // Prevent hoisting
-        __syncthreads();
-
-        // Bucket tile of keys
-        Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys);
-    }
-
-
-    /**
-     * Processes a single load (may have some threads masked off)
-     */
-    __device__ __forceinline__ void ProcessPartialTile(
-        Offset block_offset,
-        const Offset &block_end)
-    {
-        // Process partial tile if necessary using single loads
-        block_offset += threadIdx.x;
-        while (block_offset < block_end)
-        {
-            // Load and bucket key
-            UnsignedBits key = d_keys_in[block_offset];
-            Bucket(key);
-            block_offset += BLOCK_THREADS;
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockRadixSortUpsweep(
-        TempStorage &temp_storage,
-        Key         *d_keys_in,
-        int         current_bit,
-        int         num_bits)
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(reinterpret_cast<UnsignedBits*>(d_keys_in)),
-        current_bit(current_bit),
-        num_bits(num_bits)
-    {}
-
-
-    /**
-     * Compute radix digit histograms from a segment of input tiles.
-     */
-    __device__ __forceinline__ void ProcessRegion(
-        Offset           block_offset,
-        const Offset     &block_end,
-        Offset           &bin_count)                ///< [out] The digit count for tid'th bin (output param, valid in the first RADIX_DIGITS threads)
-    {
-        // Reset digit counters in smem and unpacked counters in registers
-        ResetDigitCounters();
-        ResetUnpackedCounters();
-
-        // Unroll batches of full tiles
-        while (block_offset + UNROLLED_ELEMENTS <= block_end)
-        {
-            for (int i = 0; i < UNROLL_COUNT; ++i)
-            {
-                ProcessFullTile(block_offset);
-                block_offset += TILE_ITEMS;
-            }
-
-            __syncthreads();
-
-            // Aggregate back into local_count registers to prevent overflow
-            UnpackDigitCounts();
-
-            __syncthreads();
-
-            // Reset composite counters in lanes
-            ResetDigitCounters();
-        }
-
-        // Unroll single full tiles
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ProcessFullTile(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Process partial tile if necessary
-        ProcessPartialTile(
-            block_offset,
-            block_end);
-
-        __syncthreads();
-
-        // Aggregate back into local_count registers
-        UnpackDigitCounts();
-
-        __syncthreads();
-
-        // Final raking reduction of counts by bin
-        ReduceUnpackedCounts(bin_count);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh
deleted file mode 100644
index d1b89de20..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_reduce_by_key_sweep.cuh
+++ /dev/null
@@ -1,743 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceSweepByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockReduceSweepByKey
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _TWO_PHASE_SCATTER,             ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockReduceSweepByKeyPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        TWO_PHASE_SCATTER       = _TWO_PHASE_SCATTER,           ///< Whether or not to coalesce output values in shared memory before scattering them to global
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockReduceSweepByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key across a range of tiles
- */
-template <
-    typename    BlockReduceSweepByKeyPolicy,    ///< Parameterized BlockReduceSweepByKeyPolicy tuning policy type
-    typename    KeysInputIterator,               ///< Random-access input iterator type for keys
-    typename    UniqueOutputIterator,              ///< Random-access output iterator type for keys
-    typename    ValuesInputIterator,             ///< Random-access input iterator type for values
-    typename    AggregatesOutputIterator,            ///< Random-access output iterator type for values
-    typename    EqualityOp,                     ///< Key equality operator type
-    typename    ReductionOp,                    ///< Value reduction operator type
-    typename    Offset>                         ///< Signed integer type for global offsets
-struct BlockReduceSweepByKey
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of key iterator
-    typedef typename std::iterator_traits<KeysInputIterator>::value_type Key;
-
-    // Data type of value iterator
-    typedef typename std::iterator_traits<ValuesInputIterator>::value_type Value;
-
-    // Tuple type for scanning (pairs accumulated segment-value with segment-index)
-    typedef ItemOffsetPair<Value, Offset> ReductionOffsetPair;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockReduceSweepByKeyPolicy::BLOCK_THREADS,
-        WARPS               = BLOCK_THREADS / CUB_PTX_WARP_THREADS,
-        ITEMS_PER_THREAD    = BlockReduceSweepByKeyPolicy::ITEMS_PER_THREAD,
-        TWO_PHASE_SCATTER   = (BlockReduceSweepByKeyPolicy::TWO_PHASE_SCATTER) && (ITEMS_PER_THREAD > 1),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type)
-        HAS_IDENTITY_ZERO       = (Equals<ReductionOp, cub::Sum>::VALUE) && (Traits<Value>::PRIMITIVE),
-
-        // Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (BlockReduceSweepByKeyPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        // Whether or not this is run-length-encoding with a constant iterator as values
-        IS_RUN_LENGTH_ENCODE    = (Equals<ValuesInputIterator, ConstantInputIterator<Value, size_t> >::VALUE) || (Equals<ValuesInputIterator, ConstantInputIterator<Value, int> >::VALUE) || (Equals<ValuesInputIterator, ConstantInputIterator<Value, unsigned int> >::VALUE),
-
-    };
-
-    // Cache-modified input iterator wrapper type for keys
-    typedef typename If<IsPointer<KeysInputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockReduceSweepByKeyPolicy::LOAD_MODIFIER, Key, Offset>,   // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            KeysInputIterator>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedKeysInputIterator;
-
-    // Cache-modified input iterator wrapper type for values
-    typedef typename If<IsPointer<ValuesInputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockReduceSweepByKeyPolicy::LOAD_MODIFIER, Value, Offset>,  // Wrap the native input pointer with CacheModifiedValuesInputIterator
-            ValuesInputIterator>::Type                                                                // Directly use the supplied input iterator type
-        WrappedValuesInputIterator;
-
-    // Reduce-value-by-segment scan operator
-    typedef ReduceBySegmentOp<ReductionOp, ReductionOffsetPair> ReduceBySegmentOp;
-
-    // Parameterized BlockLoad type for keys
-    typedef BlockLoad<
-            WrappedKeysInputIterator,
-            BlockReduceSweepByKeyPolicy::BLOCK_THREADS,
-            BlockReduceSweepByKeyPolicy::ITEMS_PER_THREAD,
-            BlockReduceSweepByKeyPolicy::LOAD_ALGORITHM>
-        BlockLoadKeys;
-
-    // Parameterized BlockLoad type for values
-    typedef BlockLoad<
-            WrappedValuesInputIterator,
-            BlockReduceSweepByKeyPolicy::BLOCK_THREADS,
-            BlockReduceSweepByKeyPolicy::ITEMS_PER_THREAD,
-            (IS_RUN_LENGTH_ENCODE) ?
-                BLOCK_LOAD_DIRECT :
-                (BlockLoadAlgorithm) BlockReduceSweepByKeyPolicy::LOAD_ALGORITHM>
-        BlockLoadValues;
-
-    // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
-    typedef BlockExchange<
-            Key,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeKeys;
-
-    // Parameterized BlockExchange type for locally compacting items as part of a two-phase scatter
-    typedef BlockExchange<
-            Value,
-            BLOCK_THREADS,
-            ITEMS_PER_THREAD>
-        BlockExchangeValues;
-
-    // Parameterized BlockDiscontinuity type for keys
-    typedef BlockDiscontinuity<Key, BLOCK_THREADS> BlockDiscontinuityKeys;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            ReductionOffsetPair,
-            BlockReduceSweepByKeyPolicy::BLOCK_THREADS,
-            BlockReduceSweepByKeyPolicy::SCAN_ALGORITHM>
-        BlockScanAllocations;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            ReductionOffsetPair,
-            ReduceBySegmentOp,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-
-        union
-        {
-            struct
-            {
-                typename BlockScanAllocations::TempStorage      scan;           // Smem needed for tile scanning
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;         // Smem needed for cooperative prefix callback
-                typename BlockDiscontinuityKeys::TempStorage    discontinuity;  // Smem needed for discontinuity detection
-                typename BlockLoadKeys::TempStorage             load_keys;      // Smem needed for loading keys
-
-                Offset      tile_idx;               // Shared tile index
-                Offset      tile_num_flags_prefix;  // Exclusive tile prefix
-            };
-
-            // Smem needed for loading values
-            typename BlockLoadValues::TempStorage load_values;
-
-            // Smem needed for compacting values
-            typename BlockExchangeValues::TempStorage exchange_values;
-
-            // Smem needed for compacting keys
-            typename BlockExchangeKeys::TempStorage exchange_keys;
-        };
-
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;      ///< Reference to temp_storage
-
-    WrappedKeysInputIterator        d_keys_in;          ///< Input keys
-    UniqueOutputIterator            d_unique_out;       ///< Unique output keys
-
-    WrappedValuesInputIterator      d_values_in;        ///< Input values
-    AggregatesOutputIterator        d_aggregates_out;   ///< Output value aggregates
-
-    InequalityWrapper<EqualityOp>   inequality_op;      ///< Key inequality operator
-    ReduceBySegmentOp               scan_op;            ///< Reduce-value-by-flag scan operator
-    Offset                          num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockReduceSweepByKey(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        KeysInputIterator           d_keys_in,          ///< Input keys
-        UniqueOutputIterator        d_unique_out,       ///< Unique output keys
-        ValuesInputIterator         d_values_in,        ///< Input values
-        AggregatesOutputIterator    d_aggregates_out,   ///< Output value aggregates
-        EqualityOp                  equality_op,        ///< Key equality operator
-        ReductionOp                 reduction_op,       ///< Value reduction operator
-        Offset                      num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_keys_in(d_keys_in),
-        d_unique_out(d_unique_out),
-        d_values_in(d_values_in),
-        d_aggregates_out(d_aggregates_out),
-        inequality_op(equality_op),
-        scan_op(reduction_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan with identity (first tile)
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ReductionOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        ReductionOffsetPair     &block_aggregate,
-        Int2Type<true>      has_identity)
-    {
-        ReductionOffsetPair identity;
-        identity.value = 0;
-        identity.offset = 0;
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate);
-    }
-
-    /**
-     * Scan without identity (first tile).  Without an identity, the first output item is undefined.
-     *
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ReductionOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        ReductionOffsetPair     &block_aggregate,
-        Int2Type<false>     has_identity)
-    {
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate);
-    }
-
-    /**
-     * Scan with identity (subsequent tile)
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ReductionOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        ReductionOffsetPair             &block_aggregate,
-        LookbackPrefixCallbackOp    &prefix_op,
-        Int2Type<true>              has_identity)
-    {
-        ReductionOffsetPair identity;
-        identity.value = 0;
-        identity.offset = 0;
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, identity, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Scan without identity (subsequent tile).  Without an identity, the first output item is undefined.
-     */
-    __device__ __forceinline__
-    void ScanBlock(
-        ReductionOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        ReductionOffsetPair             &block_aggregate,
-        LookbackPrefixCallbackOp    &prefix_op,
-        Int2Type<false>             has_identity)
-    {
-        BlockScanAllocations(temp_storage.scan).ExclusiveScan(values_and_segments, values_and_segments, scan_op, block_aggregate, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Zip utility methods
-    //---------------------------------------------------------------------
-
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void ZipValuesAndFlags(
-        Offset          num_remaining,
-        Value           (&values)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        ReductionOffsetPair (&values_and_segments)[ITEMS_PER_THREAD])
-    {
-        // Zip values and flags
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            // Unset flags for out-of-bounds keys
-            if ((LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_remaining))
-                flags[ITEM] = 0;
-
-            values_and_segments[ITEM].value      = values[ITEM];
-            values_and_segments[ITEM].offset     = flags[ITEM];
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Scatter utility methods
-    //---------------------------------------------------------------------
-
-
-
-    /**
-     * Scatter flagged items to output offsets (specialized for direct scattering)
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate. As such:
-     * - The scatter offsets must be decremented for value value aggregates
-     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
-     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
-     *
-     */
-    template <bool LAST_TILE, bool FIRST_TILE, int ITEM>
-    __device__ __forceinline__ void ScatterDirect(
-        Offset              num_remaining,
-        Key                 (&keys)[ITEMS_PER_THREAD],
-        ReductionOffsetPair     (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset              (&flags)[ITEMS_PER_THREAD],
-        Offset              tile_num_flags,
-        Int2Type<ITEM>      iteration)
-    {
-        // Scatter key
-        if (flags[ITEM])
-        {
-            d_unique_out[values_and_segments[ITEM].offset] = keys[ITEM];
-        }
-
-        bool is_first_flag     = FIRST_TILE && (ITEM == 0) && (threadIdx.x == 0);
-        bool is_oob_value      = (LAST_TILE) && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining);
-
-        // Scatter value reduction
-        if (((flags[ITEM] || is_oob_value)) && (!is_first_flag))
-        {
-            d_aggregates_out[values_and_segments[ITEM].offset - 1] = values_and_segments[ITEM].value;
-        }
-
-        ScatterDirect<LAST_TILE, FIRST_TILE>(num_remaining, keys, values_and_segments, flags, tile_num_flags, Int2Type<ITEM + 1>());
-    }
-
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        Offset                      num_remaining,
-        Key                         (&keys)[ITEMS_PER_THREAD],
-        ReductionOffsetPair             (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset                      (&flags)[ITEMS_PER_THREAD],
-        Offset                      tile_num_flags,
-        Int2Type<ITEMS_PER_THREAD>  iteration)
-    {}
-
-    /**
-     * Scatter flagged items to output offsets (specialized for two-phase scattering)
-     *
-     * The exclusive scan causes each head flag to be paired with the previous
-     * value aggregate. As such:
-     * - The scatter offsets must be decremented for value value aggregates
-     * - The first tile does not scatter the first flagged value (it is undefined from the exclusive scan)
-     * - If the tile is partially-full, we need to scatter the first out-of-bounds value (which aggregates all valid values in the last segment)
-     *
-     */
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ReductionOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Offset          tile_num_flags_prefix)
-    {
-        int     local_ranks[ITEMS_PER_THREAD];
-        Value   values[ITEMS_PER_THREAD];
-
-        // Share exclusive tile prefix
-        if (threadIdx.x == 0)
-        {
-            temp_storage.tile_num_flags_prefix = tile_num_flags_prefix;
-        }
-
-        __syncthreads();
-
-        // Load exclusive tile prefix in all threads
-        tile_num_flags_prefix = temp_storage.tile_num_flags_prefix;
-
-        __syncthreads();
-
-        // Compute local scatter ranks
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            local_ranks[ITEM] = values_and_segments[ITEM].offset - tile_num_flags_prefix;
-        }
-
-        // Compact keys in shared memory
-        BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, local_ranks, flags);
-
-        // Scatter keys
-        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_unique_out + tile_num_flags_prefix, keys, tile_num_flags);
-
-        // Unzip values and set flag for first oob item in last tile
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            values[ITEM] = values_and_segments[ITEM].value;
-
-            if (FIRST_TILE)
-                local_ranks[ITEM]--;
-
-            if (LAST_TILE && (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining))
-                flags[ITEM] = 1;
-        }
-
-        // Unset first flag in first tile
-        if (FIRST_TILE && (threadIdx.x == 0))
-            flags[0] = 0;
-
-        __syncthreads();
-
-        // Compact values in shared memory
-        BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, local_ranks, flags);
-
-        // Number to output
-        Offset exchange_count = tile_num_flags;
-
-        if (LAST_TILE && (num_remaining < TILE_ITEMS))
-            exchange_count++;
-
-        if (FIRST_TILE)
-        {
-            exchange_count--;
-        }
-        else
-        {
-            tile_num_flags_prefix--;
-        }
-
-        // Scatter values
-        StoreDirectStriped<BLOCK_THREADS>(threadIdx.x, d_aggregates_out + tile_num_flags_prefix, values, exchange_count);
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Scatter flagged items
-     */
-    template <bool LAST_TILE, bool FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset          num_remaining,
-        Key             (&keys)[ITEMS_PER_THREAD],
-        ReductionOffsetPair (&values_and_segments)[ITEMS_PER_THREAD],
-        Offset          (&flags)[ITEMS_PER_THREAD],
-        Offset          tile_num_flags,
-        Offset          tile_num_flags_prefix)
-    {
-        // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one
-        if (TWO_PHASE_SCATTER && (tile_num_flags > BLOCK_THREADS))
-        {
-            ScatterTwoPhase<LAST_TILE, FIRST_TILE>(
-                num_remaining,
-                keys,
-                values_and_segments,
-                flags,
-                tile_num_flags,
-                tile_num_flags_prefix);
-        }
-        else
-        {
-            ScatterDirect<LAST_TILE, FIRST_TILE>(
-                num_remaining,
-                keys,
-                values_and_segments,
-                flags,
-                tile_num_flags,
-                Int2Type<0>());
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <
-        bool                LAST_TILE>
-    __device__ __forceinline__ ReductionOffsetPair ConsumeTile(
-        Offset              num_items,          ///< Total number of global input items
-        Offset              num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState       &tile_status)       ///< Global list of tile status
-    {
-        Key                 keys[ITEMS_PER_THREAD];                         // Tile keys
-        Value               values[ITEMS_PER_THREAD];                       // Tile values
-        Offset              flags[ITEMS_PER_THREAD];                        // Segment head flags
-        ReductionOffsetPair values_and_segments[ITEMS_PER_THREAD];          // Zipped values and segment flags|indices
-        ReductionOffsetPair running_total;                                  // Running count of segments and current value aggregate (including this tile)
-
-        // Load keys
-        if (LAST_TILE)
-            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys, num_remaining);
-        else
-            BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + block_offset, keys);
-
-        if (tile_idx == 0)
-        {
-            // First tile
-            __syncthreads();
-
-            // Load values
-            if (LAST_TILE)
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
-            else
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
-
-            __syncthreads();
-
-            // Set head flags.  First tile sets the first flag for the first item
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op);
-
-            // Zip values and flags
-            ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
-
-            // Exclusive scan of values and flags
-            ReductionOffsetPair block_aggregate;
-            ScanBlock(values_and_segments, block_aggregate, Int2Type<HAS_IDENTITY_ZERO>());
-
-            // Update tile status if this is not the last tile
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, block_aggregate);
-
-            // Set offset for first scan output
-            if (!HAS_IDENTITY_ZERO && (threadIdx.x == 0))
-                values_and_segments[0].offset = 0;
-
-            running_total = block_aggregate;
-
-            // Scatter flagged items
-            Scatter<LAST_TILE, true>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, 0);
-        }
-        else
-        {
-            // Not first tile
-
-            Key tile_predecessor_key = (threadIdx.x == 0) ?
-                d_keys_in[block_offset - 1] :
-                ZeroInitialize<Key>();
-
-            __syncthreads();
-
-            // Load values
-            if (LAST_TILE)
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values, num_remaining);
-            else
-                BlockLoadValues(temp_storage.load_values).Load(d_values_in + block_offset, values);
-
-            __syncthreads();
-
-            // Set head flags
-            BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads(flags, keys, inequality_op, tile_predecessor_key);
-
-            // Zip values and flags
-            ZipValuesAndFlags<LAST_TILE>(num_remaining, values, flags, values_and_segments);
-
-            // Exclusive scan of values and flags
-            ReductionOffsetPair block_aggregate;
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
-
-            ScanBlock(values_and_segments, block_aggregate, prefix_op, Int2Type<HAS_IDENTITY_ZERO>());
-            running_total = prefix_op.inclusive_prefix;
-
-            // Scatter flagged items
-            Scatter<LAST_TILE, false>(num_remaining, keys, values_and_segments, flags, block_aggregate.offset, prefix_op.exclusive_prefix.offset);
-        }
-
-        return running_total;
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumRunsIterator>         ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        GridQueue<int>          queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState           &tile_status,       ///< Global list of tile status
-        NumRunsIterator     d_num_runs_out)     ///< Output pointer for total number of segments identified
-    {
-#if (CUB_PTX_ARCH <= 130)
-        // Blocks are launched in increasing order, so just assign one tile per block
-
-        int     tile_idx        = (blockIdx.y * 32 * 1024) + blockIdx.x;    // Current tile index
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-        {
-            // Full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-        else if (num_remaining > 0)
-        {
-            // Last tile
-            ReductionOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Output the total number of items selected
-            if (threadIdx.x == 0)
-            {
-                *d_num_runs_out = running_total.offset;
-
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    d_aggregates_out[running_total.offset - 1] = running_total.value;
-                }
-            }
-        }
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int     tile_idx        = temp_storage.tile_idx;
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;    // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;         // Remaining items (including this tile)
-
-        while (num_remaining > TILE_ITEMS)
-        {
-            // Consume full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get tile index
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx        = temp_storage.tile_idx;
-            block_offset    = Offset(TILE_ITEMS) * tile_idx;
-            num_remaining   = num_items - block_offset;
-        }
-
-        if (num_remaining > 0)
-        {
-            // Consume last tile (treat as partially-full)
-            ReductionOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            if ((threadIdx.x == 0))
-            {
-                // Output the total number of items selected
-                *d_num_runs_out = running_total.offset;
-
-                // If the last tile is a whole tile, the inclusive prefix contains accumulated value reduction for the last segment
-                if (num_remaining == TILE_ITEMS)
-                {
-                    d_aggregates_out[running_total.offset - 1] = running_total.value;
-                }
-            }
-        }
-#endif
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh
deleted file mode 100644
index 0f04be3b9..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_reduce_sweep.cuh
+++ /dev/null
@@ -1,430 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockReduceSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../block/block_load.cuh"
-#include "../block/block_reduce.cuh"
-#include "../grid/grid_mapping.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../grid/grid_even_share.cuh"
-#include "../util_type.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockReduceSweep
- */
-template <
-    int                     _BLOCK_THREADS,         ///< Threads per thread block
-    int                     _ITEMS_PER_THREAD,      ///< Items per thread (per tile of input)
-    int                     _VECTOR_LOAD_LENGTH,    ///< Number of items per vectorized load
-    BlockReduceAlgorithm    _BLOCK_ALGORITHM,       ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier       _LOAD_MODIFIER,         ///< Cache load modifier for reading input elements
-    GridMappingStrategy     _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
-struct BlockReduceSweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS       = _BLOCK_THREADS,       ///< Threads per thread block
-        ITEMS_PER_THREAD    = _ITEMS_PER_THREAD,    ///< Items per thread (per tile of input)
-        VECTOR_LOAD_LENGTH  = _VECTOR_LOAD_LENGTH,  ///< Number of items per vectorized load
-    };
-
-    static const BlockReduceAlgorithm  BLOCK_ALGORITHM      = _BLOCK_ALGORITHM;     ///< Cooperative block-wide reduction algorithm to use
-    static const CacheLoadModifier     LOAD_MODIFIER        = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const GridMappingStrategy   GRID_MAPPING         = _GRID_MAPPING;        ///< How to map tiles of input onto thread blocks
-};
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockReduceSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction across a range of tiles.
- *
- * Each thread reduces only the values it loads. If \p FIRST_TILE, this
- * partial reduction is stored into \p thread_aggregate.  Otherwise it is
- * accumulated into \p thread_aggregate.
- */
-template <
-    typename BlockReduceSweepPolicy,        ///< Parameterized BlockReduceSweepPolicy tuning policy type
-    typename InputIterator,                 ///< Random-access iterator type for input
-    typename Offset,                        ///< Signed integer type for global offsets
-    typename ReductionOp>                   ///< Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct BlockReduceSweep
-{
-
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // The value type of the input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Vector type of T for data movement
-    typedef typename CubVector<T, BlockReduceSweepPolicy::VECTOR_LOAD_LENGTH>::Type VectorT;
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockReduceSweepPolicy::LOAD_MODIFIER, T, Offset>,  // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                            // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockReduceSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockReduceSweepPolicy::ITEMS_PER_THREAD,
-        VECTOR_LOAD_LENGTH  = CUB_MIN(ITEMS_PER_THREAD, BlockReduceSweepPolicy::VECTOR_LOAD_LENGTH),
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type
-        CAN_VECTORIZE       = (VECTOR_LOAD_LENGTH > 1) &&
-                                (IsPointer<InputIterator>::VALUE) &&
-                                Traits<T>::PRIMITIVE,
-
-    };
-
-    static const CacheLoadModifier    LOAD_MODIFIER   = BlockReduceSweepPolicy::LOAD_MODIFIER;
-    static const BlockReduceAlgorithm BLOCK_ALGORITHM = BlockReduceSweepPolicy::BLOCK_ALGORITHM;
-
-    // Parameterized BlockReduce primitive
-    typedef BlockReduce<T, BLOCK_THREADS, BlockReduceSweepPolicy::BLOCK_ALGORITHM> BlockReduceT;
-
-    /// Shared memory type required by this thread block
-    typedef typename BlockReduceT::TempStorage _TempStorage;
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    T                       thread_aggregate;   ///< Each thread's partial reduction
-    _TempStorage&           temp_storage;       ///< Reference to temp_storage
-    InputIterator           d_in;               ///< Input data to reduce
-    WrappedInputIterator    d_wrapped_in;       ///< Wrapped input data to reduce
-    ReductionOp             reduction_op;       ///< Binary reduction operator
-    int                     first_tile_size;    ///< Size of first tile consumed
-    bool                    is_aligned;         ///< Whether or not input is vector-aligned
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-
-    // Whether or not the input is aligned with the vector type (specialized for types we can vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<true>  can_vectorize)
-    {
-        return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0;
-    }
-
-    // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize)
-    template <typename Iterator>
-    static __device__ __forceinline__ bool IsAligned(
-        Iterator        d_in,
-        Int2Type<false> can_vectorize)
-    {
-        return false;
-    }
-
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockReduceSweep(
-        TempStorage&            temp_storage,       ///< Reference to temp_storage
-        InputIterator           d_in,               ///< Input data to reduce
-        ReductionOp             reduction_op)       ///< Binary reduction operator
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_wrapped_in(d_in),
-        reduction_op(reduction_op),
-        first_tile_size(0),
-        is_aligned(IsAligned(d_in, Int2Type<CAN_VECTORIZE>()))
-    {}
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we cannot vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset             block_offset,            ///< The offset the tile to consume
-        Int2Type<false>     can_vectorize)           ///< Whether or not we can vectorize loads
-    {
-        T items[ITEMS_PER_THREAD];
-
-        // Load items in striped fashion
-        LoadDirectStriped<BLOCK_THREADS>(threadIdx.x, d_wrapped_in + block_offset, items);
-
-        // Reduce items within each thread stripe
-        return ThreadReduce(items, reduction_op);
-    }
-
-
-    /**
-     * Consume a full tile of input (specialized for cases where we can vectorize)
-     */
-    template <typename _Offset>
-    __device__ __forceinline__ T ConsumeFullTile(
-        _Offset             block_offset,            ///< The offset the tile to consume
-        Int2Type<true>      can_vectorize)           ///< Whether or not we can vectorize loads
-    {
-        if (!is_aligned)
-        {
-            // Not aligned
-            return ConsumeFullTile(block_offset, Int2Type<false>());
-        }
-        else
-        {
-            // Alias items as an array of VectorT and load it in striped fashion
-            enum { WORDS =  ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH };
-
-            T items[ITEMS_PER_THREAD];
-
-            VectorT *vec_items = reinterpret_cast<VectorT*>(items);
-
-            // Vector input iterator wrapper type
-            CacheModifiedInputIterator<BlockReduceSweepPolicy::LOAD_MODIFIER, VectorT, Offset> d_vec_in(
-                reinterpret_cast<VectorT*>(d_in + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH)));
-
-            #pragma unroll
-            for (int i = 0; i < WORDS; ++i)
-                vec_items[i] = d_vec_in[BLOCK_THREADS * i];
-
-            // Reduce items within each thread stripe
-            return ThreadReduce(items, reduction_op);
-        }
-    }
-
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset  block_offset,                   ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)       ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile
-            T partial = ConsumeFullTile(block_offset, Int2Type<CAN_VECTORIZE>());
-
-            // Update running thread aggregate
-            thread_aggregate = (first_tile_size) ?
-                reduction_op(thread_aggregate, partial) :       // Update
-                partial;                                        // Assign
-        }
-        else
-        {
-            // Partial tile
-            int thread_offset = threadIdx.x;
-
-            if (!first_tile_size && (thread_offset < valid_items))
-            {
-                // Assign thread_aggregate
-                thread_aggregate = d_wrapped_in[block_offset + thread_offset];
-                thread_offset += BLOCK_THREADS;
-            }
-
-            while (thread_offset < valid_items)
-            {
-                // Update thread aggregate
-                T item = d_wrapped_in[block_offset + thread_offset];
-                thread_aggregate = reduction_op(thread_aggregate, item);
-                thread_offset += BLOCK_THREADS;
-            }
-        }
-
-        // Set first tile size if necessary
-        if (!first_tile_size)
-            first_tile_size = valid_items;
-    }
-
-
-    //---------------------------------------------------------------
-    // Consume a contiguous segment of tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
-        T       &block_aggregate)                   ///< [out] Running total
-    {
-        // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile<true>(block_offset);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Reduce a contiguous segment of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                              num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>               &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>                   &queue,             ///< [in,out] GridQueue descriptor
-        T                                   &block_aggregate,   ///< [out] Running total
-        Int2Type<GRID_MAPPING_EVEN_SHARE>   is_even_share)      ///< [in] Marker type indicating this is an even-share mapping
-    {
-        // Initialize even-share descriptor for this thread block
-        even_share.BlockInit();
-
-        // Consume input tiles
-        ConsumeRange(even_share.block_offset, even_share.block_end, block_aggregate);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        GridQueue<Offset>   queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        T                   &block_aggregate)   ///< [out] Running total
-    {
-        // Shared dequeue offset
-        __shared__ Offset dequeue_offset;
-
-        // We give each thread block at least one tile of input.
-        Offset block_offset = blockIdx.x * TILE_ITEMS;
-        Offset even_share_base = gridDim.x * TILE_ITEMS;
-
-        if (block_offset + TILE_ITEMS <= num_items)
-        {
-            // Consume full tile of input
-            ConsumeTile<true>(block_offset);
-
-            // Dequeue more tiles
-            while (true)
-            {
-                 // Dequeue a tile of items
-                if (threadIdx.x == 0)
-                    dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-                __syncthreads();
-
-                // Grab tile offset and check if we're done with full tiles
-                block_offset = dequeue_offset;
-
-                __syncthreads();
-
-                if (block_offset + TILE_ITEMS > num_items)
-                    break;
-
-                // Consume a full tile
-                ConsumeTile<true>(block_offset);
-            }
-        }
-
-        if (block_offset < num_items)
-        {
-            int valid_items = num_items - block_offset;
-            ConsumeTile<false>(block_offset, valid_items);
-        }
-
-        // Compute block-wide reduction
-        block_aggregate = (first_tile_size < TILE_ITEMS) ?
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op, first_tile_size) :
-            BlockReduceT(temp_storage).Reduce(thread_aggregate, reduction_op);
-    }
-
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset                          num_items,          ///< [in] Total number of global input items
-        GridEvenShare<Offset>           &even_share,        ///< [in] GridEvenShare descriptor
-        GridQueue<Offset>               &queue,             ///< [in,out] GridQueue descriptor
-        T                               &block_aggregate,   ///< [out] Running total
-        Int2Type<GRID_MAPPING_DYNAMIC>  is_dynamic)         ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        ConsumeRange(num_items, queue, block_aggregate);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh
deleted file mode 100644
index acb1f8dd1..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_rle_sweep.cuh
+++ /dev/null
@@ -1,848 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockRleSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../iterator/constant_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockRleSweep
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockRleSweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockRleSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode across a range of tiles
- */
-template <
-    typename    BlockRleSweepPolicy,      ///< Parameterized BlockRleSweepPolicy tuning policy type
-    typename    InputIterator,            ///< Random-access input iterator type for data
-    typename    OffsetsOutputIterator,    ///< Random-access output iterator type for offset values
-    typename    LengthsOutputIterator,    ///< Random-access output iterator type for length values
-    typename    EqualityOp,               ///< T equality operator type
-    typename    Offset>                   ///< Signed integer type for global offsets
-struct BlockRleSweep
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Signed integer type for run lengths
-    typedef typename std::iterator_traits<LengthsOutputIterator>::value_type Length;
-
-    // Tuple type for scanning (pairs run-length and run-index)
-    typedef ItemOffsetPair<Length, Offset> LengthOffsetPair;
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<Length, Offset> ScanTileState;
-
-    // Constants
-    enum
-    {
-        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
-        BLOCK_THREADS           = BlockRleSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD        = BlockRleSweepPolicy::ITEMS_PER_THREAD,
-        WARP_ITEMS              = WARP_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        /// Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (BlockRleSweepPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = BlockRleSweepPolicy::STORE_WARP_TIME_SLICING,
-        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
-    };
-
-
-    /**
-     * Special operator that signals all out-of-bounds items are not equal to everything else,
-     * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked
-     * trivial.
-     */
-    template <bool LAST_TILE>
-    struct OobInequalityOp
-    {
-        Offset          num_remaining;
-        EqualityOp      equality_op;
-
-        __device__ __forceinline__ OobInequalityOp(
-            Offset      num_remaining,
-            EqualityOp  equality_op)
-        :
-            num_remaining(num_remaining),
-            equality_op(equality_op)
-        {}
-
-        template <typename Index>
-        __device__ __forceinline__ bool operator()(T first, T second, Index idx)
-        {
-            if (!LAST_TILE || (idx < num_remaining))
-                return !equality_op(first, second);
-            else
-                return true;
-        }
-    };
-
-
-    // Cache-modified input iterator wrapper type for data
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockRleSweepPolicy::LOAD_MODIFIER, T, Offset>,      // Wrap the native input pointer with CacheModifiedVLengthnputIterator
-            InputIterator>::Type                                                                     // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Parameterized BlockLoad type for data
-    typedef BlockLoad<
-            WrappedInputIterator,
-            BlockRleSweepPolicy::BLOCK_THREADS,
-            BlockRleSweepPolicy::ITEMS_PER_THREAD,
-            BlockRleSweepPolicy::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockDiscontinuity type for data
-    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
-
-    // Parameterized WarpScan type
-    typedef WarpScan<LengthOffsetPair> WarpScanPairs;
-
-    // Reduce-length-by-run scan operator
-    typedef ReduceBySegmentOp<cub::Sum, LengthOffsetPair> ReduceBySegmentOp;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            LengthOffsetPair,
-            ReduceBySegmentOp,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Warp exchange types
-    typedef WarpExchange<LengthOffsetPair, ITEMS_PER_THREAD>    WarpExchangePairs;
-
-    typedef typename If<STORE_WARP_TIME_SLICING, typename WarpExchangePairs::TempStorage, NullType>::Type WarpExchangePairsStorage;
-
-    typedef WarpExchange<Offset, ITEMS_PER_THREAD>              WarpExchangeOffsets;
-    typedef WarpExchange<Length, ITEMS_PER_THREAD>              WarpExchangeLengths;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-        union
-        {
-            struct
-            {
-                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
-                typename WarpScanPairs::TempStorage             warp_scan[WARPS];           // Smem needed for warp-synchronous scans
-                LengthOffsetPair                                warp_aggregates[WARPS];     // Smem needed for sharing warp-wide aggregates
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;                     // Smem needed for cooperative prefix callback
-            };
-
-            // Smem needed for input loading
-            typename BlockLoadT::TempStorage                    load;
-
-            // Smem needed for two-phase scatter
-            union
-            {
-                unsigned long long                              align;
-                WarpExchangePairsStorage                        exchange_pairs[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeOffsets::TempStorage       exchange_offsets[ACTIVE_EXCHANGE_WARPS];
-                typename WarpExchangeLengths::TempStorage       exchange_lengths[ACTIVE_EXCHANGE_WARPS];
-            };
-        };
-
-        Offset              tile_idx;                   // Shared tile index
-        LengthOffsetPair    tile_inclusive;             // Inclusive tile prefix
-        LengthOffsetPair    tile_exclusive;             // Exclusive tile prefix
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;      ///< Reference to temp_storage
-
-    WrappedInputIterator            d_in;               ///< Pointer to input sequence of data items
-    OffsetsOutputIterator           d_offsets_out;      ///< Input run offsets
-    LengthsOutputIterator           d_lengths_out;      ///< Output run lengths
-
-    EqualityOp                      equality_op;        ///< T equality operator
-    ReduceBySegmentOp               scan_op;            ///< Reduce-length-by-flag scan operator
-    Offset                          num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockRleSweep(
-        TempStorage                 &temp_storage,      ///< [in] Reference to temp_storage
-        InputIterator               d_in,               ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIterator       d_offsets_out,      ///< [out] Pointer to output sequence of run offsets
-        LengthsOutputIterator       d_lengths_out,      ///< [out] Pointer to output sequence of run lengths
-        EqualityOp                  equality_op,        ///< [in] T equality operator
-        Offset                      num_items)          ///< [in] Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_offsets_out(d_offsets_out),
-        d_lengths_out(d_lengths_out),
-        equality_op(equality_op),
-        scan_op(cub::Sum()),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset              block_offset,
-        Offset              num_remaining,
-        T                   (&items)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        bool                head_flags[ITEMS_PER_THREAD];
-        bool                tail_flags[ITEMS_PER_THREAD];
-
-        OobInequalityOp<LAST_TILE> inequality_op(num_remaining, equality_op);
-
-        if (FIRST_TILE && LAST_TILE)
-        {
-            // First-and-last-tile always head-flags the first item and tail-flags the last item
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, items, inequality_op);
-        }
-        else if (FIRST_TILE)
-        {
-            // First-tile always head-flags the first item
-
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[block_offset + TILE_ITEMS];
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
-                head_flags, tail_flags, tile_successor_item, items, inequality_op);
-        }
-        else if (LAST_TILE)
-        {
-            // Last-tile always flags the last item
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[block_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, items, inequality_op);
-        }
-        else
-        {
-            // Get the first item from the next tile
-            T tile_successor_item;
-            if (threadIdx.x == BLOCK_THREADS - 1)
-                tile_successor_item = d_in[block_offset + TILE_ITEMS];
-
-            // Get the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[block_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeadsAndTails(
-                head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op);
-        }
-
-        // Zip counts and runs
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            lengths_and_num_runs[ITEM].offset   = head_flags[ITEM] && (!tail_flags[ITEM]);
-            lengths_and_num_runs[ITEM].value    = ((!head_flags[ITEM]) || (!tail_flags[ITEM]));
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Scan utility methods
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan of allocations
-     */
-    __device__ __forceinline__ void WarpScanAllocations(
-        LengthOffsetPair    &tile_aggregate,
-        LengthOffsetPair    &warp_aggregate,
-        LengthOffsetPair    &warp_exclusive_in_tile,
-        LengthOffsetPair    &thread_exclusive_in_warp,
-        LengthOffsetPair    (&lengths_and_num_runs)[ITEMS_PER_THREAD])
-    {
-        // Perform warpscans
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        LengthOffsetPair identity;
-        identity.offset = 0;
-        identity.value = 0;
-
-        LengthOffsetPair thread_inclusive;
-        LengthOffsetPair thread_aggregate = ThreadReduce(lengths_and_num_runs, scan_op);
-        WarpScanPairs(temp_storage.warp_scan[warp_id]).Scan(
-            thread_aggregate,
-            thread_inclusive,
-            thread_exclusive_in_warp,
-            identity,
-            scan_op);
-
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = thread_inclusive;
-
-        __syncthreads();
-
-        // Accumulate total selected and the warp-wide prefix
-        warp_exclusive_in_tile          = identity;
-        warp_aggregate                  = temp_storage.warp_aggregates[warp_id];
-        tile_aggregate                  = temp_storage.warp_aggregates[0];
-
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_exclusive_in_tile = tile_aggregate;
-
-            tile_aggregate = scan_op(tile_aggregate, temp_storage.warp_aggregates[WARP]);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for scattering selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Two-phase scatter, specialized for warp time-slicing
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset              tile_num_runs_exclusive_in_global,
-        Offset              warp_num_runs_aggregate,
-        Offset              warp_num_runs_exclusive_in_tile,
-        Offset              (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<true>      is_warp_time_slice)
-    {
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Locally compact items within the warp (first warp)
-        if (warp_id == 0)
-        {
-            WarpExchangePairs(temp_storage.exchange_pairs[0]).ScatterToStriped(lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-        }
-
-        // Locally compact items within the warp (remaining warps)
-        #pragma unroll
-        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
-        {
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                WarpExchangePairs(temp_storage.exchange_pairs[0]).ScatterToStriped(lengths_and_offsets, thread_num_runs_exclusive_in_warp);
-            }
-        }
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id)
-            {
-                Offset item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].offset;
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Two-phase scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset              tile_num_runs_exclusive_in_global,
-        Offset              warp_num_runs_aggregate,
-        Offset              warp_num_runs_exclusive_in_tile,
-        Offset              (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD],
-        Int2Type<false>     is_warp_time_slice)
-    {
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Unzip
-        Offset run_offsets[ITEMS_PER_THREAD];
-        Length run_lengths[ITEMS_PER_THREAD];
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            run_offsets[ITEM] = lengths_and_offsets[ITEM].offset;
-            run_lengths[ITEM] = lengths_and_offsets[ITEM].value;
-        }
-
-        WarpExchangeOffsets(temp_storage.exchange_offsets[warp_id]).ScatterToStriped(run_offsets, thread_num_runs_exclusive_in_warp);
-
-        if (sizeof(Length) == sizeof(Offset))
-            __threadfence_block();
-        else
-            __syncthreads();
-
-        WarpExchangeLengths(temp_storage.exchange_lengths[warp_id]).ScatterToStriped(run_lengths, thread_num_runs_exclusive_in_warp);
-
-        // Global scatter
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate)
-            {
-                Offset item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    (ITEM * WARP_THREADS) + lane_id;
-
-                // Scatter offset
-                d_offsets_out[item_offset] = run_offsets[ITEM];
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = run_lengths[ITEM];
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Direct scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void ScatterDirect(
-        Offset              tile_num_runs_exclusive_in_global,
-        Offset              warp_num_runs_aggregate,
-        Offset              warp_num_runs_exclusive_in_tile,
-        Offset              (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate)
-            {
-                Offset item_offset =
-                    tile_num_runs_exclusive_in_global +
-                    warp_num_runs_exclusive_in_tile +
-                    thread_num_runs_exclusive_in_warp[ITEM];
-
-                // Scatter offset
-                d_offsets_out[item_offset] = lengths_and_offsets[ITEM].offset;
-
-                // Scatter length if not the first (global) length
-                if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0))
-                {
-                    d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value;
-                }
-            }
-        }
-    }
-
-
-    /**
-     * Scatter
-     */
-    template <bool FIRST_TILE>
-    __device__ __forceinline__ void Scatter(
-        Offset              tile_num_runs_aggregate,
-        Offset              tile_num_runs_exclusive_in_global,
-        Offset              warp_num_runs_aggregate,
-        Offset              warp_num_runs_exclusive_in_tile,
-        Offset              (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD],
-        LengthOffsetPair    (&lengths_and_offsets)[ITEMS_PER_THREAD])
-    {
-        if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS))
-        {
-            // Direct scatter if the warp has any items
-            if (warp_num_runs_aggregate)
-            {
-                ScatterDirect<FIRST_TILE>(
-                    tile_num_runs_exclusive_in_global,
-                    warp_num_runs_aggregate,
-                    warp_num_runs_exclusive_in_tile,
-                    thread_num_runs_exclusive_in_warp,
-                    lengths_and_offsets);
-            }
-        }
-        else
-        {
-            // Scatter two phase
-            ScatterTwoPhase<FIRST_TILE>(
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets,
-                Int2Type<STORE_WARP_TIME_SLICING>());
-        }
-
-    }
-
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <
-        bool                LAST_TILE>
-    __device__ __forceinline__ LengthOffsetPair ConsumeTile(
-        Offset              num_items,          ///< Total number of global input items
-        Offset              num_remaining,      ///< Number of global input items remaining (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState       &tile_status)       ///< Global list of tile status
-    {
-        if (tile_idx == 0)
-        {
-            // First tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining, ZeroInitialize<T>());
-            else
-                BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<true, LAST_TILE>(
-                block_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // Update tile status if this is not the last tile
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, tile_aggregate);
-
-            // Update thread_exclusive_in_warp to fold in warp run-length
-            if (thread_exclusive_in_warp.offset == 0)
-                thread_exclusive_in_warp.value += warp_exclusive_in_tile.value;
-
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            Offset              thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-
-            // Downsweep scan through lengths_and_num_runs
-            ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].offset        = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].offset) ?
-                                                                lengths_and_num_runs2[ITEM].offset :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            Offset tile_num_runs_aggregate              = tile_aggregate.offset;
-            Offset tile_num_runs_exclusive_in_global    = 0;
-            Offset warp_num_runs_aggregate              = warp_aggregate.offset;
-            Offset warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.offset;
-
-            // Scatter
-            Scatter<true>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return tile_aggregate;
-        }
-        else
-        {
-            // Not first tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-                BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining, ZeroInitialize<T>());
-            else
-                BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Set flags
-            LengthOffsetPair    lengths_and_num_runs[ITEMS_PER_THREAD];
-
-            InitializeSelections<false, LAST_TILE>(
-                block_offset,
-                num_remaining,
-                items,
-                lengths_and_num_runs);
-
-            // Exclusive scan of lengths and runs
-            LengthOffsetPair tile_aggregate;
-            LengthOffsetPair warp_aggregate;
-            LengthOffsetPair warp_exclusive_in_tile;
-            LengthOffsetPair thread_exclusive_in_warp;
-
-            WarpScanAllocations(
-                tile_aggregate,
-                warp_aggregate,
-                warp_exclusive_in_tile,
-                thread_exclusive_in_warp,
-                lengths_and_num_runs);
-
-            // First warp computes tile prefix in lane 0
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx);
-            int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-            if (warp_id == 0)
-            {
-                prefix_op(tile_aggregate);
-                if (threadIdx.x == 0)
-                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
-            }
-
-            __syncthreads();
-
-            LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive;
-
-            // Update thread_exclusive_in_warp to fold in warp and tile run-lengths
-            LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile);
-            if (thread_exclusive_in_warp.offset == 0)
-                thread_exclusive_in_warp.value += thread_exclusive.value;
-
-            // Downsweep scan through lengths_and_num_runs
-            LengthOffsetPair    lengths_and_num_runs2[ITEMS_PER_THREAD];
-            LengthOffsetPair    lengths_and_offsets[ITEMS_PER_THREAD];
-            Offset              thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD];
-
-            ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp);
-
-            // Zip
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                lengths_and_offsets[ITEM].value         = lengths_and_num_runs2[ITEM].value;
-                lengths_and_offsets[ITEM].offset        = block_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
-                thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].offset) ?
-                                                                lengths_and_num_runs2[ITEM].offset :         // keep
-                                                                WARP_THREADS * ITEMS_PER_THREAD;            // discard
-            }
-
-            Offset tile_num_runs_aggregate              = tile_aggregate.offset;
-            Offset tile_num_runs_exclusive_in_global    = tile_exclusive_in_global.offset;
-            Offset warp_num_runs_aggregate              = warp_aggregate.offset;
-            Offset warp_num_runs_exclusive_in_tile      = warp_exclusive_in_tile.offset;
-
-            // Scatter
-            Scatter<false>(
-                tile_num_runs_aggregate,
-                tile_num_runs_exclusive_in_global,
-                warp_num_runs_aggregate,
-                warp_num_runs_exclusive_in_tile,
-                thread_num_runs_exclusive_in_warp,
-                lengths_and_offsets);
-
-            // Return running total (inclusive of this tile)
-            return prefix_op.inclusive_prefix;
-        }
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumRunsIterator>         ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_tiles,          ///< Total number of input tiles
-        GridQueue<int>      queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState       &tile_status,       ///< Global list of tile status
-        NumRunsIterator     d_num_runs_out)         ///< Output pointer for total number of runs identified
-    {
-
-#if __CUDA_ARCH__ > 130
-
-        // Blocks may not be launched in increasing order, so work-steal tiles
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int tile_idx = temp_storage.tile_idx;
-
-#else
-
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
-
-#endif
-
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (tile_idx < num_tiles - 1)
-        {
-            // Full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-        else
-        {
-            // Last tile
-            LengthOffsetPair running_total = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            if (threadIdx.x == 0)
-            {
-                // Output the total number of items selected
-                *d_num_runs_out = running_total.offset;
-
-                // The inclusive prefix contains accumulated length reduction for the last run
-                d_lengths_out[running_total.offset - 1] = running_total.value;
-            }
-        }
-
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh
deleted file mode 100644
index ad3ab9d2f..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_scan_prefix_operators.cuh
+++ /dev/null
@@ -1,759 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Callback operator types for supplying BlockScan prefixes
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../warp/warp_reduce.cuh"
-#include "../util_arch.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Prefix functor type for maintaining a running prefix while scanning a
- * region independent of other thread blocks
- ******************************************************************************/
-
-/**
- * Stateful callback operator type for supplying BlockScan prefixes.
- * Maintains a running prefix that can be applied to consecutive
- * BlockScan operations.
- */
-template <
-    typename T,                 ///< BlockScan value type
-    typename ScanOp>            ///< Wrapped scan operator type
-struct BlockScanRunningPrefixOp
-{
-    ScanOp  op;                 ///< Wrapped scan operator
-    T       running_total;      ///< Running block-wide prefix
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOp op)
-    :
-        op(op)
-    {}
-
-    /// Constructor
-    __device__ __forceinline__ BlockScanRunningPrefixOp(
-        T starting_prefix,
-        ScanOp op)
-    :
-        op(op),
-        running_total(starting_prefix)
-    {}
-
-    /**
-     * Prefix callback operator.  Returns the block-wide running_total in thread-0.
-     */
-    __device__ __forceinline__ T operator()(
-        const T &block_aggregate)              ///< The aggregate sum of the BlockScan inputs
-    {
-        T retval = running_total;
-        running_total = op(running_total, block_aggregate);
-        return retval;
-    }
-};
-
-
-/******************************************************************************
- * Generic tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Enumerations of tile status
- */
-enum ScanTileStatus
-{
-    SCAN_TILE_OOB,          // Out-of-bounds (e.g., padding)
-    SCAN_TILE_INVALID,      // Not yet processed
-    SCAN_TILE_PARTIAL,      // Tile aggregate is available
-    SCAN_TILE_INCLUSIVE,    // Inclusive tile prefix is available
-};
-
-
-/**
- * Tile status interface.
- */
-template <
-    typename    T,
-    bool        SINGLE_WORD = Traits<T>::PRIMITIVE>
-struct ScanTileState;
-
-
-/**
- * Tile status interface specialized for scan status and value types
- * that can be combined into one machine word that can be
- * read/written coherently in a single access.
- */
-template <typename T>
-struct ScanTileState<T, true>
-{
-    // Status word type
-    typedef typename If<(sizeof(T) == 8),
-        long long,
-        typename If<(sizeof(T) == 4),
-            int,
-            typename If<(sizeof(T) == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-
-    // Unit word type
-    typedef typename If<(sizeof(T) == 8),
-        longlong2,
-        typename If<(sizeof(T) == 4),
-            int2,
-            typename If<(sizeof(T) == 2),
-                int,
-                uchar2>::Type>::Type>::Type TxnWord;
-
-
-    // Device word type
-    struct TileDescriptor
-    {
-        StatusWord  status;
-        T           value;
-    };
-
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-
-    // Device storage
-    TileDescriptor *d_tile_status;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        // Use warp-any to determine when all threads have valid status
-        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        while ((tile_descriptor.status == SCAN_TILE_INVALID))
-        {
-            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        }
-
-        status = tile_descriptor.status;
-        value = tile_descriptor.value;
-    }
-
-};
-
-
-
-/**
- * Tile status interface specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <typename T>
-struct ScanTileState<T, false>
-{
-    // Status word type
-    typedef char StatusWord;
-
-    // Constants
-    enum
-    {
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Device storage
-    StatusWord  *d_tile_status;
-    T           *d_tile_partial;
-    T           *d_tile_inclusive;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ScanTileState()
-    :
-        d_tile_status(NULL),
-        d_tile_partial(NULL),
-        d_tile_inclusive(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        cudaError_t error = cudaSuccess;
-        do
-        {
-            void*   allocations[3];
-            size_t  allocation_sizes[3];
-
-            allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);           // bytes needed for tile status descriptors
-            allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for partials
-            allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);     // bytes needed for inclusives
-
-            // Compute allocation pointers into the single storage blob
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Alias the offsets
-            d_tile_status       = reinterpret_cast<StatusWord*>(allocations[0]);
-            d_tile_partial      = reinterpret_cast<T*>(allocations[1]);
-            d_tile_inclusive    = reinterpret_cast<T*>(allocations[2]);
-        }
-        while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        // Specify storage allocation requirements
-        size_t  allocation_sizes[3];
-        allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord);         // bytes needed for tile status descriptors
-        allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for partials
-        allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized<T>);   // bytes needed for inclusives
-
-        // Set the necessary size of the blob
-        void* allocations[3];
-        return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes));
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive)
-    {
-        // Update tile inclusive value
-        ThreadStore<STORE_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx, tile_inclusive);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE));
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial)
-    {
-        // Update tile partial value
-        ThreadStore<STORE_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial);
-
-        // Fence
-        __threadfence();
-
-        // Update tile status
-        ThreadStore<STORE_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL));
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        T               &value)
-    {
-        status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-        while (status == SCAN_TILE_INVALID)
-        {
-            status = ThreadLoad<LOAD_CG>(d_tile_status + TILE_STATUS_PADDING + tile_idx);
-        }
-
-        T partial = ThreadLoad<LOAD_CG>(d_tile_partial + TILE_STATUS_PADDING + tile_idx);
-        T inclusive = ThreadLoad<LOAD_CG>(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx);
-
-        value = (status == StatusWord(SCAN_TILE_PARTIAL)) ?
-            partial :
-            inclusive;
-
-    }
-};
-
-
-/******************************************************************************
- * ReduceByKey tile status interface types for block-cooperative scans
- ******************************************************************************/
-
-/**
- * Tile status interface for reduction by key.
- *
- */
-template <
-    typename    Value,
-    typename    Offset,
-    bool        SINGLE_WORD = (Traits<Value>::PRIMITIVE) && (sizeof(Value) + sizeof(Offset) < 16)>
-struct ReduceByKeyScanTileState;
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * cannot be combined into one machine word.
- */
-template <
-    typename    Value,
-    typename    Offset>
-struct ReduceByKeyScanTileState<Value, Offset, false> :
-    ScanTileState<ItemOffsetPair<Value, Offset> >
-{
-    typedef ScanTileState<ItemOffsetPair<Value, Offset> > SuperClass;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState() : SuperClass() {}
-};
-
-
-/**
- * Tile status interface for reduction by key, specialized for scan status and value types that
- * can be combined into one machine word that can be read/written coherently in a single access.
- */
-template <
-    typename Value,
-    typename Offset>
-struct ReduceByKeyScanTileState<Value, Offset, true>
-{
-    typedef ItemOffsetPair<Value, Offset> ReductionOffsetPair;
-
-    // Constants
-    enum
-    {
-        PAIR_SIZE           = sizeof(Value) + sizeof(Offset),
-        TXN_WORD_SIZE       = 1 << Log2<PAIR_SIZE + 1>::VALUE,
-        STATUS_WORD_SIZE    = TXN_WORD_SIZE - PAIR_SIZE,
-
-        TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS,
-    };
-
-    // Status word type
-    typedef typename If<(STATUS_WORD_SIZE == 8),
-        long long,
-        typename If<(STATUS_WORD_SIZE == 4),
-            int,
-            typename If<(STATUS_WORD_SIZE == 2),
-                short,
-                char>::Type>::Type>::Type StatusWord;
-
-    // Status word type
-    typedef typename If<(TXN_WORD_SIZE == 16),
-        longlong2,
-        typename If<(TXN_WORD_SIZE == 8),
-            long long,
-            int>::Type>::Type TxnWord;
-
-    // Device word type (for when sizeof(Value) == sizeof(Offset))
-    struct TileDescriptorBigStatus
-    {
-        Offset      offset;
-        Value       value;
-        StatusWord  status;
-    };
-
-    // Device word type (for when sizeof(Value) != sizeof(Offset))
-    struct TileDescriptorLittleStatus
-    {
-        Value       value;
-        StatusWord  status;
-        Offset      offset;
-    };
-
-    // Device word type
-    typedef typename If<
-            (sizeof(Value) == sizeof(Offset)),
-            TileDescriptorBigStatus,
-            TileDescriptorLittleStatus>::Type
-        TileDescriptor;
-
-
-    // Device storage
-    TileDescriptor *d_tile_status;
-
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    ReduceByKeyScanTileState()
-    :
-        d_tile_status(NULL)
-    {}
-
-
-    /// Initializer
-    __host__ __device__ __forceinline__
-    cudaError_t Init(
-        int     num_tiles,                          ///< [in] Number of tiles
-        void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t  temp_storage_bytes)                 ///< [in] Size in bytes of \t d_temp_storage allocation
-    {
-        d_tile_status = reinterpret_cast<TileDescriptor*>(d_temp_storage);
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Compute device memory needed for tile status
-     */
-    __host__ __device__ __forceinline__
-    static cudaError_t AllocationSize(
-        int     num_tiles,                          ///< [in] Number of tiles
-        size_t  &temp_storage_bytes)                ///< [out] Size in bytes of \t d_temp_storage allocation
-    {
-        temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor);       // bytes needed for tile status descriptors
-        return cudaSuccess;
-    }
-
-
-    /**
-     * Initialize (from device)
-     */
-    __device__ __forceinline__ void InitializeStatus(int num_tiles)
-    {
-        int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x;
-        if (tile_idx < num_tiles)
-        {
-            // Not-yet-set
-            d_tile_status[TILE_STATUS_PADDING + tile_idx].status = StatusWord(SCAN_TILE_INVALID);
-        }
-
-        if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING))
-        {
-            // Padding
-            d_tile_status[threadIdx.x].status = StatusWord(SCAN_TILE_OOB);
-        }
-    }
-
-
-    /**
-     * Update the specified tile's inclusive value and corresponding status
-     */
-    __device__ __forceinline__ void SetInclusive(int tile_idx, ReductionOffsetPair tile_inclusive)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_INCLUSIVE;
-        tile_descriptor.value = tile_inclusive.value;
-        tile_descriptor.offset = tile_inclusive.offset;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-
-    /**
-     * Update the specified tile's partial value and corresponding status
-     */
-    __device__ __forceinline__ void SetPartial(int tile_idx, ReductionOffsetPair tile_partial)
-    {
-        TileDescriptor tile_descriptor;
-        tile_descriptor.status = SCAN_TILE_PARTIAL;
-        tile_descriptor.value = tile_partial.value;
-        tile_descriptor.offset = tile_partial.offset;
-
-        TxnWord alias;
-        *reinterpret_cast<TileDescriptor*>(&alias) = tile_descriptor;
-        ThreadStore<STORE_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx), alias);
-    }
-
-    /**
-     * Wait for the corresponding tile to become non-invalid
-     */
-    __device__ __forceinline__ void WaitForValid(
-        int             tile_idx,
-        StatusWord      &status,
-        ReductionOffsetPair  &value)
-    {
-        // Use warp-any to determine when all threads have valid status
-        TxnWord alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-        TileDescriptor tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-
-        while (WarpAny(tile_descriptor.status == SCAN_TILE_INVALID))
-        {
-            alias = ThreadLoad<LOAD_CG>(reinterpret_cast<TxnWord*>(d_tile_status + TILE_STATUS_PADDING + tile_idx));
-            tile_descriptor = reinterpret_cast<TileDescriptor&>(alias);
-        }
-
-        status = tile_descriptor.status;
-        value.value = tile_descriptor.value;
-        value.offset = tile_descriptor.offset;
-    }
-
-};
-
-
-/******************************************************************************
- * Prefix call-back operator for coupling local block scan within a
- * block-cooperative scan
- ******************************************************************************/
-
-/**
- * Stateful block-scan prefix functor.  Provides the the running prefix for
- * the current tile by using the call-back warp to wait on on
- * aggregates/prefixes from predecessor tiles to become available.
- */
-template <
-    typename T,
-    typename ScanOp,
-    typename ScanTileState>
-struct BlockScanLookbackPrefixOp
-{
-    // Parameterized warp reduce
-    typedef WarpReduce<T> WarpReduceT;
-
-    // Temporary storage type
-    typedef typename WarpReduceT::TempStorage _TempStorage;
-
-    // Alias wrapper allowing temporary storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-    // Type of status word
-    typedef typename ScanTileState::StatusWord StatusWord;
-
-    // Fields
-    ScanTileState               &tile_status;       ///< Interface to tile status
-    _TempStorage                &temp_storage;      ///< Reference to a warp-reduction instance
-    ScanOp                      scan_op;            ///< Binary scan operator
-    int                         tile_idx;           ///< The current tile index
-    T                           exclusive_prefix;   ///< Exclusive prefix for the tile
-    T                           inclusive_prefix;   ///< Inclusive prefix for the tile
-
-    // Constructor
-    __device__ __forceinline__
-    BlockScanLookbackPrefixOp(
-        ScanTileState       &tile_status,
-        TempStorage         &temp_storage,
-        ScanOp              scan_op,
-        int                 tile_idx)
-    :
-        tile_status(tile_status),
-        temp_storage(temp_storage.Alias()),
-        scan_op(scan_op),
-        tile_idx(tile_idx) {}
-
-
-    // Block until all predecessors within the warp-wide window have non-invalid status
-    __device__ __forceinline__
-    void ProcessWindow(
-        int         predecessor_idx,        ///< Preceding tile index to inspect
-        StatusWord  &predecessor_status,    ///< [out] Preceding tile status
-        T           &window_aggregate)      ///< [out] Relevant partial reduction from this window of preceding tiles
-    {
-        T value;
-        tile_status.WaitForValid(predecessor_idx, predecessor_status, value);
-
-        // Perform a segmented reduction to get the prefix for the current window.
-        int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE));
-
-        window_aggregate = WarpReduceT(temp_storage).TailSegmentedReduce(
-            value,
-            tail_flag,
-            scan_op);
-    }
-
-
-    // BlockScan prefix callback functor (called by the first warp)
-    __device__ __forceinline__
-    T operator()(T block_aggregate)
-    {
-        // Update our status with our tile-aggregate
-        if (threadIdx.x == 0)
-        {
-            tile_status.SetPartial(tile_idx, block_aggregate);
-        }
-
-        int         predecessor_idx = tile_idx - threadIdx.x - 1;
-        StatusWord  predecessor_status;
-        T           window_aggregate;
-
-        // Wait for the warp-wide window of predecessor tiles to become valid
-        ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-
-        // The exclusive tile prefix starts out as the current window aggregate
-        exclusive_prefix = window_aggregate;
-
-        // Keep sliding the window back until we come across a tile whose inclusive prefix is known
-        while (WarpAll(predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)))
-        {
-            predecessor_idx -= CUB_PTX_WARP_THREADS;
-
-            // Update exclusive tile prefix with the window prefix
-            ProcessWindow(predecessor_idx, predecessor_status, window_aggregate);
-            exclusive_prefix = scan_op(window_aggregate, exclusive_prefix);
-        }
-
-        // Compute the inclusive tile prefix and update the status for this tile
-        if (threadIdx.x == 0)
-        {
-            inclusive_prefix = scan_op(exclusive_prefix, block_aggregate);
-            tile_status.SetInclusive(tile_idx, inclusive_prefix);
-        }
-
-        // Return exclusive_prefix
-        return exclusive_prefix;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh
deleted file mode 100644
index 8c6cf35c4..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_scan_sweep.cuh
+++ /dev/null
@@ -1,544 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockScanSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockScanSweep
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    bool                        _LOAD_WARP_TIME_SLICING,        ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    BlockStoreAlgorithm         _STORE_ALGORITHM,               ///< The BlockStore algorithm to use
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockScanSweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        LOAD_WARP_TIME_SLICING  = _LOAD_WARP_TIME_SLICING,      ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;          ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;           ///< Cache load modifier for reading input elements
-    static const BlockStoreAlgorithm    STORE_ALGORITHM         = _STORE_ALGORITHM;         ///< The BlockStore algorithm to use
-    static const BlockScanAlgorithm     SCAN_ALGORITHM    = _SCAN_ALGORITHM;    ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockScanSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan across a range of tiles.
- */
-template <
-    typename BlockScanSweepPolicy,      ///< Parameterized BlockScanSweepPolicy tuning policy type
-    typename InputIterator,             ///< Random-access input iterator type
-    typename OutputIterator,            ///< Random-access output iterator type
-    typename ScanOp,                    ///< Scan functor type
-    typename Identity,                  ///< Identity element type (cub::NullType for inclusive scan)
-    typename Offset>                    ///< Signed integer type for global offsets
-struct BlockScanSweep
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<T> ScanTileState;
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockScanSweepPolicy::LOAD_MODIFIER, T, Offset>,    // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                            // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Constants
-    enum
-    {
-        INCLUSIVE           = Equals<Identity, NullType>::VALUE,            // Inclusive scan if no identity type is provided
-        BLOCK_THREADS       = BlockScanSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockScanSweepPolicy::ITEMS_PER_THREAD,
-        TILE_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        // Whether or not to sync after loading data
-        SYNC_AFTER_LOAD     = (BlockScanSweepPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-    };
-
-    // Parameterized BlockLoad type
-    typedef BlockLoad<
-            WrappedInputIterator,
-            BlockScanSweepPolicy::BLOCK_THREADS,
-            BlockScanSweepPolicy::ITEMS_PER_THREAD,
-            BlockScanSweepPolicy::LOAD_ALGORITHM,
-            BlockScanSweepPolicy::LOAD_WARP_TIME_SLICING>
-        BlockLoadT;
-
-    // Parameterized BlockStore type
-    typedef BlockStore<
-            OutputIterator,
-            BlockScanSweepPolicy::BLOCK_THREADS,
-            BlockScanSweepPolicy::ITEMS_PER_THREAD,
-            BlockScanSweepPolicy::STORE_ALGORITHM,
-            BlockScanSweepPolicy::STORE_WARP_TIME_SLICING>
-        BlockStoreT;
-
-    // Parameterized BlockScan type
-    typedef BlockScan<
-            T,
-            BlockScanSweepPolicy::BLOCK_THREADS,
-            BlockScanSweepPolicy::SCAN_ALGORITHM>
-        BlockScanT;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            T,
-            ScanOp,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles
-    typedef BlockScanRunningPrefixOp<
-            T,
-            ScanOp>
-        RunningPrefixCallbackOp;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-        union
-        {
-            typename BlockLoadT::TempStorage    load;       // Smem needed for tile loading
-            typename BlockStoreT::TempStorage   store;      // Smem needed for tile storing
-            struct
-            {
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;     // Smem needed for cooperative prefix callback
-                typename BlockScanT::TempStorage                scan;       // Smem needed for tile scanning
-            };
-        };
-
-        Offset tile_idx;   // Shared tile index
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                &temp_storage;      ///< Reference to temp_storage
-    WrappedInputIterator        d_in;               ///< Input data
-    OutputIterator              d_out;              ///< Output data
-    ScanOp                      scan_op;            ///< Binary scan operator
-    Identity                    identity;           ///< Identity element
-
-
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods (first tile)
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization
-     */
-    template <typename _ScanOp, typename _Identity>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate);
-    }
-
-    /**
-     * Exclusive sum specialization
-     */
-    template <typename _Identity>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate);
-    }
-
-    /**
-     * Inclusive scan specialization
-     */
-    template <typename _ScanOp>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate);
-    }
-
-    /**
-     * Inclusive sum specialization
-     */
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate)
-    {
-        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate);
-    }
-
-    //---------------------------------------------------------------------
-    // Block scan utility methods (subsequent tiles)
-    //---------------------------------------------------------------------
-
-    /**
-     * Exclusive scan specialization (with prefix from predecessors)
-     */
-    template <typename _ScanOp, typename _Identity, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveScan(items, items, identity, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Exclusive sum specialization (with prefix from predecessors)
-     */
-    template <typename _Identity, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, _Identity identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).ExclusiveSum(items, items, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Inclusive scan specialization (with prefix from predecessors)
-     */
-    template <typename _ScanOp, typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], _ScanOp scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate, prefix_op);
-    }
-
-    /**
-     * Inclusive sum specialization (with prefix from predecessors)
-     */
-    template <typename PrefixCallback>
-    __device__ __forceinline__
-    void ScanBlock(T (&items)[ITEMS_PER_THREAD], Sum scan_op, NullType identity, T& block_aggregate, PrefixCallback &prefix_op)
-    {
-        BlockScanT(temp_storage.scan).InclusiveSum(items, items, block_aggregate, prefix_op);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockScanSweep(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIterator               d_in,               ///< Input data
-        OutputIterator              d_out,              ///< Output data
-        ScanOp                      scan_op,            ///< Binary scan operator
-        Identity                    identity)           ///< Identity element
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out(d_out),
-        scan_op(scan_op),
-        identity(identity)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset              num_items,          ///< Total number of input items
-        Offset              num_remaining,      ///< Total number of items remaining to be processed (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState       &tile_status)       ///< Global list of tile status
-    {
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (LAST_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, num_remaining);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-
-        if (SYNC_AFTER_LOAD)
-            __syncthreads();
-
-        // Perform tile scan
-        if (tile_idx == 0)
-        {
-            // Scan first tile
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate);
-
-            // Update tile status if there may be successor tiles (i.e., this tile is full)
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, block_aggregate);
-        }
-        else
-        {
-            // Scan non-first tile
-            T block_aggregate;
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, scan_op, tile_idx);
-            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
-        }
-
-        __syncthreads();
-
-        // Store items
-        if (LAST_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, num_remaining);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic chained scan
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        int                 num_items,          ///< Total number of input items
-        GridQueue<int>      queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState       &tile_status)       ///< Global list of tile status
-    {
-#if (CUB_PTX_ARCH <= 130)
-        // Blocks are launched in increasing order, so just assign one tile per block
-
-        int     tile_idx        = (blockIdx.y * gridDim.x) + blockIdx.x;    // Current tile index
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (num_remaining > TILE_ITEMS)
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        else if (num_remaining > 0)
-            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-#else
-        // Blocks may not be launched in increasing order, so work-steal tiles
-
-        // Get first tile index
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int     tile_idx        = temp_storage.tile_idx;
-        Offset  block_offset    = TILE_ITEMS * tile_idx;
-        Offset  num_remaining   = num_items - block_offset;
-
-        while (num_remaining >= TILE_ITEMS)
-        {
-            // Consume full tile
-            ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-            // Get next tile
-            if (threadIdx.x == 0)
-                temp_storage.tile_idx = queue.Drain(1);
-
-            __syncthreads();
-
-            tile_idx        = temp_storage.tile_idx;
-            block_offset    = TILE_ITEMS * tile_idx;
-            num_remaining   = num_items - block_offset;
-        }
-
-        // Consume the last (and potentially partially-full) tile
-        if (num_remaining > 0)
-        {
-            ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-        }
-
-#endif
-
-    }
-
-
-    //---------------------------------------------------------------------
-    // Scan an sequence of consecutive tiles (independent of other thread blocks)
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input
-     */
-    template <
-        bool                FULL_TILE,
-        bool                FIRST_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset                      block_offset,               ///< Tile offset
-        RunningPrefixCallbackOp     &prefix_op,                 ///< Running prefix operator
-        int                         valid_items = TILE_ITEMS)   ///< Number of valid items in the tile
-    {
-        // Load items
-        T items[ITEMS_PER_THREAD];
-
-        if (FULL_TILE)
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items);
-        else
-            BlockLoadT(temp_storage.load).Load(d_in + block_offset, items, valid_items);
-
-        __syncthreads();
-
-        // Block scan
-        if (FIRST_TILE)
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate);
-            prefix_op.running_total = block_aggregate;
-        }
-        else
-        {
-            T block_aggregate;
-            ScanBlock(items, scan_op, identity, block_aggregate, prefix_op);
-        }
-
-        __syncthreads();
-
-        // Store items
-        if (FULL_TILE)
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items);
-        else
-            BlockStoreT(temp_storage.store).Store(d_out + block_offset, items, valid_items);
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset   block_offset,      ///< [in] Threadblock begin offset (inclusive)
-        Offset   block_end)         ///< [in] Threadblock end offset (exclusive)
-    {
-        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(scan_op);
-
-        if (block_offset + TILE_ITEMS <= block_end)
-        {
-            // Consume first tile of input (full)
-            ConsumeTile<true, true>(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-
-            // Consume subsequent full tiles of input
-            while (block_offset + TILE_ITEMS <= block_end)
-            {
-                ConsumeTile<true, false>(block_offset, prefix_op);
-                block_offset += TILE_ITEMS;
-            }
-
-            // Consume a partially-full tile
-            if (block_offset < block_end)
-            {
-                int valid_items = block_end - block_offset;
-                ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
-            }
-        }
-        else
-        {
-            // Consume the first tile of input (partially-full)
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false, true>(block_offset, prefix_op, valid_items);
-        }
-    }
-
-
-    /**
-     * Scan a consecutive share of input tiles, seeded with the specified prefix value
-     */
-    __device__ __forceinline__ void ConsumeRange(
-        Offset  block_offset,                       ///< [in] Threadblock begin offset (inclusive)
-        Offset  block_end,                          ///< [in] Threadblock end offset (exclusive)
-        T       prefix)                             ///< [in] The prefix to apply to the scan segment
-    {
-        BlockScanRunningPrefixOp<T, ScanOp> prefix_op(prefix, scan_op);
-
-        // Consume full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
-        {
-            ConsumeTile<true, false>(block_offset, prefix_op);
-            block_offset += TILE_ITEMS;
-        }
-
-        // Consume a partially-full tile
-        if (block_offset < block_end)
-        {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false, false>(block_offset, prefix_op, valid_items);
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh
deleted file mode 100644
index 9c361a2f0..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/block_select_sweep.cuh
+++ /dev/null
@@ -1,718 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockSelectSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide select.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "block_scan_prefix_operators.cuh"
-#include "../block/block_load.cuh"
-#include "../block/block_store.cuh"
-#include "../block/block_scan.cuh"
-#include "../block/block_exchange.cuh"
-#include "../block/block_discontinuity.cuh"
-#include "../grid/grid_queue.cuh"
-#include "../iterator/cache_modified_input_iterator.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Tuning policy types
- ******************************************************************************/
-
-/**
- * Parameterizable tuning policy type for BlockSelectSweep
- */
-template <
-    int                         _BLOCK_THREADS,                 ///< Threads per thread block
-    int                         _ITEMS_PER_THREAD,              ///< Items per thread (per tile of input)
-    BlockLoadAlgorithm          _LOAD_ALGORITHM,                ///< The BlockLoad algorithm to use
-    CacheLoadModifier           _LOAD_MODIFIER,                 ///< Cache load modifier for reading input elements
-    bool                        _STORE_WARP_TIME_SLICING,       ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    BlockScanAlgorithm          _SCAN_ALGORITHM>                ///< The BlockScan algorithm to use
-struct BlockSelectSweepPolicy
-{
-    enum
-    {
-        BLOCK_THREADS           = _BLOCK_THREADS,               ///< Threads per thread block
-        ITEMS_PER_THREAD        = _ITEMS_PER_THREAD,            ///< Items per thread (per tile of input)
-        STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING,     ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-    };
-
-    static const BlockLoadAlgorithm     LOAD_ALGORITHM          = _LOAD_ALGORITHM;      ///< The BlockLoad algorithm to use
-    static const CacheLoadModifier      LOAD_MODIFIER           = _LOAD_MODIFIER;       ///< Cache load modifier for reading input elements
-    static const BlockScanAlgorithm     SCAN_ALGORITHM          = _SCAN_ALGORITHM;      ///< The BlockScan algorithm to use
-};
-
-
-
-
-/******************************************************************************
- * Thread block abstractions
- ******************************************************************************/
-
-/**
- * \brief BlockSelectSweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection across a range of tiles
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename    BlockSelectSweepPolicy,         ///< Parameterized BlockSelectSweepPolicy tuning policy type
-    typename    InputIterator,                  ///< Random-access input iterator type for selection items
-    typename    FlagsInputIterator,                   ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIterator,                 ///< Random-access input iterator type for selected items
-    typename    SelectOp,                       ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection)
-    typename    EqualityOp,                     ///< Equality operator type (NullType if selection functor or selections is to be used for selection)
-    typename    Offset,                         ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct BlockSelectSweep
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Data type of flag iterator
-    typedef typename std::iterator_traits<FlagsInputIterator>::value_type Flag;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<Offset> ScanTileState;
-
-    // Constants
-    enum
-    {
-        USE_SELECT_OP,
-        USE_SELECT_FLAGS,
-        USE_DISCONTINUITY,
-
-        BLOCK_THREADS           = BlockSelectSweepPolicy::BLOCK_THREADS,
-
-        /// Number of warp threads
-        WARP_THREADS            = CUB_WARP_THREADS(PTX_ARCH),
-
-        /// Number of active warps
-        WARPS                   = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS,
-
-        ITEMS_PER_THREAD        = BlockSelectSweepPolicy::ITEMS_PER_THREAD,
-        TILE_ITEMS              = BLOCK_THREADS * ITEMS_PER_THREAD,
-
-        /// Whether or not to sync after loading data
-        SYNC_AFTER_LOAD         = (BlockSelectSweepPolicy::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT),
-
-        /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage)
-        STORE_WARP_TIME_SLICING = BlockSelectSweepPolicy::STORE_WARP_TIME_SLICING,
-        ACTIVE_EXCHANGE_WARPS   = (STORE_WARP_TIME_SLICING) ? 1 : WARPS,
-
-        SELECT_METHOD           = (!Equals<SelectOp, NullType>::VALUE) ?
-                                    USE_SELECT_OP :
-                                    (!Equals<Flag, NullType>::VALUE) ?
-                                        USE_SELECT_FLAGS :
-                                        USE_DISCONTINUITY
-    };
-
-    // Input iterator wrapper type
-    typedef typename If<IsPointer<InputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockSelectSweepPolicy::LOAD_MODIFIER, T, Offset>,      // Wrap the native input pointer with CacheModifiedInputIterator
-            InputIterator>::Type                                                                // Directly use the supplied input iterator type
-        WrappedInputIterator;
-
-    // Flag iterator wrapper type
-    typedef typename If<IsPointer<FlagsInputIterator>::VALUE,
-            CacheModifiedInputIterator<BlockSelectSweepPolicy::LOAD_MODIFIER, Flag, Offset>,   // Wrap the native input pointer with CacheModifiedInputIterator
-            FlagsInputIterator>::Type                                                                 // Directly use the supplied input iterator type
-        WrappedFlagsInputIterator;
-
-    // Parameterized BlockLoad type for input items
-    typedef BlockLoad<
-            WrappedInputIterator,
-            BlockSelectSweepPolicy::BLOCK_THREADS,
-            BlockSelectSweepPolicy::ITEMS_PER_THREAD,
-            BlockSelectSweepPolicy::LOAD_ALGORITHM>
-        BlockLoadT;
-
-    // Parameterized BlockLoad type for flags
-    typedef BlockLoad<
-            WrappedFlagsInputIterator,
-            BlockSelectSweepPolicy::BLOCK_THREADS,
-            BlockSelectSweepPolicy::ITEMS_PER_THREAD,
-            BlockSelectSweepPolicy::LOAD_ALGORITHM>
-        BlockLoadFlags;
-
-    // Parameterized BlockDiscontinuity type for input items
-    typedef BlockDiscontinuity<T, BLOCK_THREADS> BlockDiscontinuityT;
-
-    // Parameterized WarpScan
-    typedef WarpScan<Offset> WarpScanAllocations;
-
-    // Callback type for obtaining tile prefix during block scan
-    typedef BlockScanLookbackPrefixOp<
-            Offset,
-            Sum,
-            ScanTileState>
-        LookbackPrefixCallbackOp;
-
-    // Warp exchange type
-    typedef WarpExchange<T, ITEMS_PER_THREAD> WarpExchangeT;
-
-    // Shared memory type for this threadblock
-    struct _TempStorage
-    {
-        union
-        {
-            struct
-            {
-                typename BlockDiscontinuityT::TempStorage       discontinuity;              // Smem needed for discontinuity detection
-                typename WarpScanAllocations::TempStorage       warp_scan[WARPS];           // Smem needed for warp-synchronous scans
-                Offset                                          warp_aggregates[WARPS];     // Smem needed for sharing warp-wide aggregates
-                typename LookbackPrefixCallbackOp::TempStorage  prefix;                     // Smem needed for cooperative prefix callback
-            };
-
-            // Smem needed for input loading
-            typename BlockLoadT::TempStorage        load_items;
-
-            // Smem needed for flag loading
-            typename BlockLoadFlags::TempStorage    load_flags;
-
-            // Smem needed for two-phase scatter
-            union
-            {
-                unsigned long long                  align;
-                typename WarpExchangeT::TempStorage exchange[ACTIVE_EXCHANGE_WARPS];
-            };
-        };
-
-        Offset      tile_idx;                   // Shared tile index
-        Offset      tile_inclusive;             // Inclusive tile prefix
-        Offset      tile_exclusive;             // Exclusive tile prefix
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    _TempStorage                    &temp_storage;      ///< Reference to temp_storage
-    WrappedInputIterator            d_in;               ///< Input data
-    WrappedFlagsInputIterator       d_flags;            ///< Input flags
-    SelectedOutputIterator          d_selected_out;     ///< Output data
-    SelectOp                        select_op;          ///< Selection operator
-    InequalityWrapper<EqualityOp>   inequality_op;      ///< Inequality operator
-    Offset                          num_items;          ///< Total number of input items
-
-
-    //---------------------------------------------------------------------
-    // Constructor
-    //---------------------------------------------------------------------
-
-    // Constructor
-    __device__ __forceinline__
-    BlockSelectSweep(
-        TempStorage                 &temp_storage,      ///< Reference to temp_storage
-        InputIterator               d_in,               ///< Input data
-        FlagsInputIterator          d_flags,            ///< Input flags
-        SelectedOutputIterator      d_selected_out,     ///< Output data
-        SelectOp                    select_op,          ///< Selection operator
-        EqualityOp                  equality_op,        ///< Equality operator
-        Offset                      num_items)          ///< Total number of input items
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_flags(d_flags),
-        d_selected_out(d_selected_out),
-        select_op(select_op),
-        inequality_op(equality_op),
-        num_items(num_items)
-    {}
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for initializing the selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Template unrolled selection via selection operator
-     */
-    template <bool FIRST_TILE, bool LAST_TILE, int ITERATION>
-    __device__ __forceinline__ void ApplySelectionOp(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<ITERATION>         iteration)
-    {
-        selected[ITERATION] = 0;
-        if (!LAST_TILE || (Offset(threadIdx.x * ITEMS_PER_THREAD) + ITERATION < num_remaining))
-            selected[ITERATION] = select_op(items[ITERATION]);
-
-        ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<ITERATION + 1>());
-    }
-
-    /**
-     * Template unrolled selection via selection operator
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void ApplySelectionOp(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<ITEMS_PER_THREAD>  iteration)
-    {}
-
-    /**
-     * Initialize selections (specialized for selection operator)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_OP>     select_method)
-    {
-        __syncthreads();
-
-        ApplySelectionOp<FIRST_TILE, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<0>());
-    }
-
-
-    /**
-     * Initialize selections (specialized for valid flags)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_SELECT_FLAGS>  select_method)
-    {
-        Flag flags[ITEMS_PER_THREAD];
-
-        if (LAST_TILE)
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags, num_remaining, 0);
-        else
-            BlockLoadFlags(temp_storage.load_flags).Load(d_flags + block_offset, flags);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            selected[ITEM] = flags[ITEM];
-        }
-
-        if (SYNC_AFTER_LOAD)
-            __syncthreads();
-    }
-
-
-    /**
-     * Initialize selections (specialized for discontinuity detection)
-     */
-    template <bool FIRST_TILE, bool LAST_TILE>
-    __device__ __forceinline__ void InitializeSelections(
-        Offset                      block_offset,
-        Offset                      num_remaining,
-        T                           (&items)[ITEMS_PER_THREAD],
-        Offset                      (&selected)[ITEMS_PER_THREAD],
-        Int2Type<USE_DISCONTINUITY> select_method)
-    {
-        if (FIRST_TILE)
-        {
-            // First tile always flags the first item
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op);
-        }
-        else
-        {
-            // Subsequent tiles require the last item from the previous tile
-            T tile_predecessor_item;
-            if (threadIdx.x == 0)
-                tile_predecessor_item = d_in[block_offset - 1];
-
-            BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selected, items, inequality_op, tile_predecessor_item);
-        }
-    }
-
-
-    //---------------------------------------------------------------------
-    // Utility methods for scan
-    //---------------------------------------------------------------------
-
-    /**
-     * Scan of allocations
-     */
-    __device__ __forceinline__ void ScanAllocations(
-        Offset  &tile_aggregate,
-        int     &warp_aggregate,
-        int     &warp_exclusive,
-        int     (&selected)[ITEMS_PER_THREAD],
-        int     (&thread_exclusives)[ITEMS_PER_THREAD])
-    {
-        // Perform warpscans
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        int thread_aggregate = ThreadReduce(selected, cub::Sum());
-        int inclusive_partial, exclusive_partial;
-        WarpScanAllocations(temp_storage.warp_scan[warp_id]).Sum(thread_aggregate, inclusive_partial, exclusive_partial);
-        ThreadScanExclusive(selected, thread_exclusives, cub::Sum(), exclusive_partial);
-
-        // Last lane in each warp shares its warp-aggregate
-        if (lane_id == WARP_THREADS - 1)
-            temp_storage.warp_aggregates[warp_id] = inclusive_partial;
-
-        __syncthreads();
-
-        // Accumulate total selected and the warp-wide prefix
-        warp_exclusive   = 0;
-        warp_aggregate   = temp_storage.warp_aggregates[warp_id];
-        tile_aggregate   = temp_storage.warp_aggregates[0];
-
-        #pragma unroll
-        for (int WARP = 1; WARP < WARPS; ++WARP)
-        {
-            if (warp_id == WARP)
-                warp_exclusive = tile_aggregate;
-
-            tile_aggregate += temp_storage.warp_aggregates[WARP];
-        }
-
-        // Push unselected items into the local exchange's guard band
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-        {
-            if (!selected[ITEM])
-                thread_exclusives[ITEM] = WARP_THREADS * ITEMS_PER_THREAD;
-        }
-    }
-
-    //---------------------------------------------------------------------
-    // Utility methods for scattering selections
-    //---------------------------------------------------------------------
-
-    /**
-     * Two-phase scatter, specialized for warp time-slicing
-     */
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset          tile_exclusive,
-        int             warp_aggregate,
-        int             warp_exclusive,
-        int             (&thread_exclusives)[ITEMS_PER_THREAD],
-        T               (&items)[ITEMS_PER_THREAD],
-        Int2Type<true>  is_warp_time_slice)
-    {
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        // Locally compact items within the warp (first warp)
-        if (warp_id == 0)
-        {
-            WarpExchangeT(temp_storage.exchange[0]).ScatterToStriped(items, thread_exclusives);
-        }
-
-        // Locally compact items within the warp (remaining warps)
-        #pragma unroll
-        for (int SLICE = 1; SLICE < WARPS; ++SLICE)
-        {
-            __syncthreads();
-
-            if (warp_id == SLICE)
-            {
-                WarpExchangeT(temp_storage.exchange[0]).ScatterToStriped(items, thread_exclusives);
-            }
-        }
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) < warp_aggregate - lane_id)
-            {
-                d_selected_out[tile_exclusive + warp_exclusive + (ITEM * WARP_THREADS) + lane_id] = items[ITEM];
-            }
-        }
-    }
-
-
-
-    /**
-     * Two-phase scatter
-     */
-    __device__ __forceinline__ void ScatterTwoPhase(
-        Offset          tile_exclusive,
-        int             warp_aggregate,
-        int             warp_exclusive,
-        int             (&thread_exclusives)[ITEMS_PER_THREAD],
-        T               (&items)[ITEMS_PER_THREAD],
-        Int2Type<false> is_warp_time_slice)
-    {
-        int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-        int lane_id = LaneId();
-
-        WarpExchangeT(temp_storage.exchange[warp_id]).ScatterToStriped(items, thread_exclusives);
-
-        #pragma unroll
-        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-        {
-            if ((ITEM * WARP_THREADS) < warp_aggregate - lane_id)
-            {
-                d_selected_out[tile_exclusive + warp_exclusive + (ITEM * WARP_THREADS) + lane_id] = items[ITEM];
-            }
-        }
-    }
-
-
-
-    /**
-     * Scatter
-     */
-    __device__ __forceinline__ void Scatter(
-        Offset  tile_aggregate,
-        Offset  tile_exclusive,
-        int     warp_aggregate,
-        int     warp_exclusive,
-        int     (&thread_exclusives)[ITEMS_PER_THREAD],
-        T       (&items)[ITEMS_PER_THREAD])
-    {
-        if ((ITEMS_PER_THREAD == 1) || (tile_aggregate < BLOCK_THREADS))
-        {
-            // Direct scatter if the warp has any items
-            if (warp_aggregate)
-            {
-                #pragma unroll
-                for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-                {
-                    if (thread_exclusives[ITEM] < warp_aggregate)
-                        d_selected_out[tile_exclusive + warp_exclusive + thread_exclusives[ITEM]] = items[ITEM];
-                }
-            }
-        }
-        else
-        {
-            ScatterTwoPhase(
-                tile_exclusive,
-                warp_aggregate,
-                warp_exclusive,
-                thread_exclusives,
-                items,
-                Int2Type<STORE_WARP_TIME_SLICING>());
-        }
-    }
-
-
-
-
-
-    //---------------------------------------------------------------------
-    // Cooperatively scan a device-wide sequence of tiles with other CTAs
-    //---------------------------------------------------------------------
-
-    /**
-     * Process a tile of input (dynamic chained scan)
-     */
-    template <bool LAST_TILE>
-    __device__ __forceinline__ Offset ConsumeTile(
-        Offset              num_items,          ///< Total number of input items
-        Offset              num_remaining,      ///< Total number of items remaining to be processed (including this tile)
-        int                 tile_idx,           ///< Tile index
-        Offset              block_offset,       ///< Tile offset
-        ScanTileState       &tile_status)       ///< Global list of tile status
-    {
-        if (tile_idx == 0)
-        {
-            // First tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-            {
-                T oob_item = (SELECT_METHOD == USE_DISCONTINUITY) ?
-                    d_in[num_items - 1] : // Repeat last item
-                    ZeroInitialize<T>();
-
-                BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, oob_item);
-            }
-            else
-            {
-                BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items);
-            }
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Initialize selected/rejected output flags for first tile
-            int selected[ITEMS_PER_THREAD];             // Selection flags
-            InitializeSelections<true, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<SELECT_METHOD>());
-
-            // Scan the selected flags
-            Offset tile_aggregate;
-            int warp_aggregate, warp_exclusive;
-            int thread_exclusives[ITEMS_PER_THREAD];    // Thread exclusive scatter prefixes
-            ScanAllocations(tile_aggregate, warp_aggregate, warp_exclusive, selected, thread_exclusives);
-
-            // Update tile status if there may be successor tiles
-            if (!LAST_TILE && (threadIdx.x == 0))
-                tile_status.SetInclusive(0, tile_aggregate);
-
-            Offset tile_exclusive = 0;
-
-            // Scatter
-            Scatter(tile_aggregate, tile_exclusive, warp_aggregate, warp_exclusive, thread_exclusives, items);
-
-            // Return total number of items selected (inclusive of this tile)
-            return tile_aggregate;
-        }
-        else
-        {
-            // Not first tile
-
-            // Load items
-            T items[ITEMS_PER_THREAD];
-            if (LAST_TILE)
-            {
-                T oob_item = (SELECT_METHOD == USE_DISCONTINUITY) ?
-                    d_in[num_items - 1] : // Repeat last item
-                    ZeroInitialize<T>();
-
-                BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items, num_remaining, oob_item);
-            }
-            else
-            {
-                BlockLoadT(temp_storage.load_items).Load(d_in + block_offset, items);
-            }
-
-            if (SYNC_AFTER_LOAD)
-                __syncthreads();
-
-            // Initialize selected/rejected output flags for non-first tile
-            int selected[ITEMS_PER_THREAD];              // Selection flags
-            InitializeSelections<false, LAST_TILE>(block_offset, num_remaining, items, selected, Int2Type<SELECT_METHOD>());
-
-            // Scan the selected flags
-            Offset tile_aggregate;
-            int warp_aggregate, warp_exclusive;
-            int thread_exclusives[ITEMS_PER_THREAD];       // Scatter offsets
-            ScanAllocations(tile_aggregate, warp_aggregate, warp_exclusive, selected, thread_exclusives);
-
-            // First warp computes tile prefix in lane 0
-            LookbackPrefixCallbackOp prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx);
-            int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS);
-            if (warp_id == 0)
-            {
-                prefix_op(tile_aggregate);
-                if (threadIdx.x == 0)
-                    temp_storage.tile_exclusive = prefix_op.exclusive_prefix;
-            }
-
-            __syncthreads();
-
-            Offset tile_exclusive = temp_storage.tile_exclusive;
-
-            // Scatter
-            Scatter(tile_aggregate, tile_exclusive, warp_aggregate, warp_exclusive, thread_exclusives, items);
-
-            // Return total number of items selected (inclusive of this tile)
-            return prefix_op.inclusive_prefix;
-        }
-    }
-
-
-    /**
-     * Dequeue and scan tiles of items as part of a dynamic chained scan
-     */
-    template <typename NumSelectedIterator>         ///< Output iterator type for recording number of items selected
-    __device__ __forceinline__ void ConsumeRange(
-        int                     num_tiles,          ///< Total number of input tiles
-        GridQueue<int>          queue,              ///< Queue descriptor for assigning tiles of work to thread blocks
-        ScanTileState           &tile_status,       ///< Global list of tile status
-        NumSelectedIterator     d_num_selected_out)     ///< Output total number selected
-    {
-
-#if __CUDA_ARCH__ > 130
-
-        // Blocks may not be launched in increasing order, so work-steal tiles
-        if (threadIdx.x == 0)
-            temp_storage.tile_idx = queue.Drain(1);
-
-        __syncthreads();
-
-        int tile_idx = temp_storage.tile_idx;
-
-#else
-
-        // Blocks are launched in increasing order, so just assign one tile per block
-        int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x;
-
-#endif
-
-        Offset  block_offset    = Offset(TILE_ITEMS) * tile_idx;            // Global offset for the current tile
-        Offset  num_remaining   = num_items - block_offset;                 // Remaining items (including this tile)
-
-        if (num_remaining > 0)
-        {
-            if (num_remaining > TILE_ITEMS)
-            {
-                // Full tile
-                ConsumeTile<false>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-            }
-            else
-            {
-                // Last tile
-                Offset total_selected = ConsumeTile<true>(num_items, num_remaining, tile_idx, block_offset, tile_status);
-
-                // Output the total number of items selected
-                if (threadIdx.x == 0)
-                {
-                    *d_num_selected_out = total_selected;
-                }
-            }
-        }
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh
deleted file mode 100644
index 39b068372..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_gatomic_sweep.cuh
+++ /dev/null
@@ -1,184 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockHistogramSweepGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram.
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * BlockHistogramSweepGlobalAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using global atomics
- */
-template <
-    typename    BlockHistogramSweepPolicy,      ///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                ///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockHistogramSweepGlobalAtomic
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockHistogramSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockHistogramSweepPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
-    };
-
-    // Shared memory type required by this thread block
-    typedef NullType TempStorage;
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockHistogramSweepGlobalAtomic(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {}
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD][CHANNELS];
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                    }
-                }
-            }
-
-            __threadfence_block();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        atomicAdd(d_out_histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
-                    }
-                }
-            }
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            int bounds = valid_items - (threadIdx.x * CHANNELS);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
-                    {
-                        SampleT item  = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                        atomicAdd(d_out_histograms[CHANNEL] + item, 1);
-                    }
-                }
-            }
-
-        }
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {}
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh
deleted file mode 100644
index 9f2bebf29..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_satomic_sweep.cuh
+++ /dev/null
@@ -1,245 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockHistogramSweepSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * BlockHistogramSweepSharedAtomic implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using shared atomics
- */
-template <
-    typename    BlockHistogramSweepPolicy,		///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                	///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockHistogramSweepSharedAtomic
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockHistogramSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockHistogramSweepPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS  = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS          = TILE_CHANNEL_ITEMS * CHANNELS,
-    };
-
-    /// Shared memory type required by this thread block
-    struct _TempStorage
-    {
-        HistoCounter histograms[ACTIVE_CHANNELS][BINS + 1];  // One word of padding between channel histograms to prevent warps working on different histograms from hammering on the same bank
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockHistogramSweepSharedAtomic(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {
-        // Initialize histogram bin counts to zeros
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int histo_offset = 0;
-
-            #pragma unroll
-            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-            {
-                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
-            }
-            // Finish up with guarded initialization if necessary
-            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
-            {
-                this->temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x] = 0;
-            }
-        }
-
-        __syncthreads();
-    }
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD][CHANNELS];
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        items[ITEM][CHANNEL] = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                    }
-                }
-            }
-
-            __threadfence_block();
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (CHANNEL < ACTIVE_CHANNELS)
-                    {
-                        atomicAdd(temp_storage.histograms[CHANNEL] + items[ITEM][CHANNEL], 1);
-                    }
-                }
-            }
-
-            __threadfence_block();
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            int bounds = valid_items - (threadIdx.x * CHANNELS);
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
-            {
-                #pragma unroll
-                for (int CHANNEL = 0; CHANNEL < CHANNELS; ++CHANNEL)
-                {
-                    if (((ACTIVE_CHANNELS == CHANNELS) || (CHANNEL < ACTIVE_CHANNELS)) && ((ITEM * BLOCK_THREADS * CHANNELS) + CHANNEL < bounds))
-                    {
-                        SampleT item = d_in[block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS) + CHANNEL];
-                        atomicAdd(temp_storage.histograms[CHANNEL] + item, 1);
-                    }
-                }
-            }
-
-        }
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {
-        // Barrier to ensure shared memory histograms are coherent
-        __syncthreads();
-
-        // Copy shared memory histograms to output
-        int channel_offset = (blockIdx.x * BINS);
-
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int histo_offset = 0;
-
-            #pragma unroll
-            for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS)
-            {
-                HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
-
-                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
-            }
-
-            // Finish up with guarded initialization if necessary
-            if ((BINS % BLOCK_THREADS != 0) && (histo_offset + threadIdx.x < BINS))
-            {
-                HistoCounter count = temp_storage.histograms[CHANNEL][histo_offset + threadIdx.x];
-
-                d_out_histograms[CHANNEL][channel_offset + histo_offset + threadIdx.x] = count;
-            }
-        }
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh b/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh
deleted file mode 100644
index bed31ed2c..000000000
--- a/thrust/system/cuda/detail/cub/block_sweep/specializations/block_histogram_sort_sweep.cuh
+++ /dev/null
@@ -1,364 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::BlockHistogramSweepSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
- */
-
-#pragma once
-
-#include <iterator>
-
-#include "../../block/block_radix_sort.cuh"
-#include "../../block/block_discontinuity.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * BlockHistogramSweepSort implements a stateful abstraction of CUDA thread blocks for histogramming multiple tiles as part of device-wide histogram using local sorting
- */
-template <
-    typename    BlockHistogramSweepPolicy,          ///< Tuning policy
-    int         BINS,                           ///< Number of histogram bins per channel
-    int         CHANNELS,                       ///< Number of channels interleaved in the input data (may be greater than the number of active channels being histogrammed)
-    int         ACTIVE_CHANNELS,                ///< Number of channels actively being histogrammed
-    typename    InputIterator,                ///< The input iterator type \iterator.  Must have an an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1]
-    typename    HistoCounter,                   ///< Integer type for counting sample occurrences per histogram bin
-    typename    Offset>                          ///< Signed integer type for global offsets
-struct BlockHistogramSweepSort
-{
-    //---------------------------------------------------------------------
-    // Types and constants
-    //---------------------------------------------------------------------
-
-    // Sample type
-    typedef typename std::iterator_traits<InputIterator>::value_type SampleT;
-
-    // Constants
-    enum
-    {
-        BLOCK_THREADS               = BlockHistogramSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD            = BlockHistogramSweepPolicy::ITEMS_PER_THREAD,
-        TILE_CHANNEL_ITEMS          = BLOCK_THREADS * ITEMS_PER_THREAD,
-        TILE_ITEMS                  = TILE_CHANNEL_ITEMS * CHANNELS,
-
-        STRIPED_COUNTERS_PER_THREAD = (BINS + BLOCK_THREADS - 1) / BLOCK_THREADS,
-    };
-
-    // Parameterize BlockRadixSort type for our thread block
-    typedef BlockRadixSort<SampleT, BLOCK_THREADS, ITEMS_PER_THREAD> BlockRadixSortT;
-
-    // Parameterize BlockDiscontinuity type for our thread block
-    typedef BlockDiscontinuity<SampleT, BLOCK_THREADS> BlockDiscontinuityT;
-
-    /// Shared memory type required by this thread block
-    union _TempStorage
-    {
-        // Storage for sorting bin values
-        typename BlockRadixSortT::TempStorage sort;
-
-        struct
-        {
-            // Storage for detecting discontinuities in the tile of sorted bin values
-            typename BlockDiscontinuityT::TempStorage flag;
-
-            // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values
-            int run_begin[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
-            int run_end[BLOCK_THREADS * STRIPED_COUNTERS_PER_THREAD];
-        };
-    };
-
-
-    /// Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    // Discontinuity functor
-    struct DiscontinuityOp
-    {
-        // Reference to temp_storage
-        _TempStorage &temp_storage;
-
-        // Constructor
-        __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) :
-            temp_storage(temp_storage)
-        {}
-
-        // Discontinuity predicate
-        __device__ __forceinline__ bool operator()(const SampleT &a, const SampleT &b, int b_index)
-        {
-            if (a != b)
-            {
-                // Note the begin/end offsets in shared storage
-                temp_storage.run_begin[b] = b_index;
-                temp_storage.run_end[a] = b_index;
-
-                return true;
-            }
-            else
-            {
-                return false;
-            }
-        }
-    };
-
-
-    //---------------------------------------------------------------------
-    // Per-thread fields
-    //---------------------------------------------------------------------
-
-    /// Reference to temp_storage
-    _TempStorage &temp_storage;
-
-    /// Histogram counters striped across threads
-    HistoCounter thread_counters[ACTIVE_CHANNELS][STRIPED_COUNTERS_PER_THREAD];
-
-    /// Reference to output histograms
-    HistoCounter* (&d_out_histograms)[ACTIVE_CHANNELS];
-
-    /// Input data to reduce
-    InputIterator d_in;
-
-
-    //---------------------------------------------------------------------
-    // Interface
-    //---------------------------------------------------------------------
-
-    /**
-     * Constructor
-     */
-    __device__ __forceinline__ BlockHistogramSweepSort(
-        TempStorage         &temp_storage,                                  ///< Reference to temp_storage
-        InputIterator     d_in,                                           ///< Input data to reduce
-        HistoCounter*       (&d_out_histograms)[ACTIVE_CHANNELS])           ///< Reference to output histograms
-    :
-        temp_storage(temp_storage.Alias()),
-        d_in(d_in),
-        d_out_histograms(d_out_histograms)
-    {
-        // Initialize histogram counters striped across threads
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            #pragma unroll
-            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-            {
-                thread_counters[CHANNEL][COUNTER] = 0;
-            }
-        }
-    }
-
-
-    /**
-     * Composite a tile of input items
-     */
-    __device__ __forceinline__ void Composite(
-        SampleT   (&items)[ITEMS_PER_THREAD],                     ///< Tile of samples
-        HistoCounter    thread_counters[STRIPED_COUNTERS_PER_THREAD])   ///< Histogram counters striped across threads
-    {
-        // Sort bytes in blocked arrangement
-        BlockRadixSortT(temp_storage.sort).Sort(items);
-
-        __syncthreads();
-
-        // Initialize the shared memory's run_begin and run_end for each bin
-        #pragma unroll
-        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-        {
-            temp_storage.run_begin[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
-            temp_storage.run_end[(COUNTER * BLOCK_THREADS) + threadIdx.x] = TILE_CHANNEL_ITEMS;
-        }
-
-        __syncthreads();
-
-        // Note the begin/end run offsets of bin runs in the sorted tile
-        int flags[ITEMS_PER_THREAD];                // unused
-        DiscontinuityOp flag_op(temp_storage);
-        BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op);
-
-        // Update begin for first item
-        if (threadIdx.x == 0) temp_storage.run_begin[items[0]] = 0;
-
-        __syncthreads();
-
-        // Composite into histogram
-        // Initialize the shared memory's run_begin and run_end for each bin
-        #pragma unroll
-        for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-        {
-            int          bin            = (COUNTER * BLOCK_THREADS) + threadIdx.x;
-            HistoCounter run_length     = temp_storage.run_end[bin] - temp_storage.run_begin[bin];
-
-            thread_counters[COUNTER] += run_length;
-        }
-    }
-
-
-    /**
-     * Process one channel within a tile.
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTileChannel(
-        int     channel,
-        Offset   block_offset,
-        int     valid_items)
-    {
-        // Load items in striped fashion
-        if (FULL_TILE)
-        {
-            // Full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD];
-
-            // Unguarded loads
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                items[ITEM] = d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)];
-            }
-
-            // Composite our histogram data
-            Composite(items, thread_counters[channel]);
-        }
-        else
-        {
-            // Only a partially-full tile of samples to read and composite
-            SampleT items[ITEMS_PER_THREAD];
-
-            // Assign our tid as the bin for out-of-bounds items (to give an even distribution), and keep track of how oob items to subtract out later
-            int bounds = (valid_items - (threadIdx.x * CHANNELS));
-
-            #pragma unroll
-            for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++)
-            {
-                items[ITEM] = ((ITEM * BLOCK_THREADS * CHANNELS) < bounds) ?
-                    d_in[channel + block_offset + (ITEM * BLOCK_THREADS * CHANNELS) + (threadIdx.x * CHANNELS)] :
-                    0;
-            }
-
-            // Composite our histogram data
-            Composite(items, thread_counters[channel]);
-
-            __syncthreads();
-
-            // Correct the overcounting in the zero-bin from invalid (out-of-bounds) items
-            if (threadIdx.x == 0)
-            {
-                int extra = (TILE_ITEMS - valid_items) / CHANNELS;
-                thread_counters[channel][0] -= extra;
-            }
-        }
-    }
-
-
-    /**
-     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Inductive step.
-     */
-    template <bool FULL_TILE, int CHANNEL, int END>
-    struct IterateChannels
-    {
-        /**
-         * Process one channel within a tile.
-         */
-        static __device__ __forceinline__ void ConsumeTileChannel(
-            BlockHistogramSweepSort *cta,
-            Offset               block_offset,
-            int                 valid_items)
-        {
-            __syncthreads();
-
-            cta->ConsumeTileChannel<FULL_TILE>(CHANNEL, block_offset, valid_items);
-
-            IterateChannels<FULL_TILE, CHANNEL + 1, END>::ConsumeTileChannel(cta, block_offset, valid_items);
-        }
-    };
-
-
-    /**
-     * Template iteration over channels (to silence not-unrolled warnings for SM10-13).  Base step.
-     */
-    template <bool FULL_TILE, int END>
-    struct IterateChannels<FULL_TILE, END, END>
-    {
-        static __device__ __forceinline__ void ConsumeTileChannel(BlockHistogramSweepSort *cta, Offset block_offset, int valid_items) {}
-    };
-
-
-    /**
-     * Process a single tile of input
-     */
-    template <bool FULL_TILE>
-    __device__ __forceinline__ void ConsumeTile(
-        Offset   block_offset,               ///< The offset the tile to consume
-        int     valid_items = TILE_ITEMS)   ///< The number of valid items in the tile
-    {
-        // First channel
-        ConsumeTileChannel<FULL_TILE>(0, block_offset, valid_items);
-
-        // Iterate through remaining channels
-        IterateChannels<FULL_TILE, 1, ACTIVE_CHANNELS>::ConsumeTileChannel(this, block_offset, valid_items);
-    }
-
-
-    /**
-     * Aggregate results into output
-     */
-    __device__ __forceinline__ void AggregateOutput()
-    {
-        // Copy counters striped across threads into the histogram output
-        #pragma unroll
-        for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-        {
-            int channel_offset  = (blockIdx.x * BINS);
-
-            #pragma unroll
-            for (int COUNTER = 0; COUNTER < STRIPED_COUNTERS_PER_THREAD; ++COUNTER)
-            {
-                int bin = (COUNTER * BLOCK_THREADS) + threadIdx.x;
-
-                if ((STRIPED_COUNTERS_PER_THREAD * BLOCK_THREADS == BINS) || (bin < BINS))
-                {
-                    d_out_histograms[CHANNEL][channel_offset + bin] = thread_counters[CHANNEL][COUNTER];
-                }
-            }
-        }
-    }
-};
-
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/cub.cuh b/thrust/system/cuda/detail/cub/cub.cuh
deleted file mode 100644
index a0902ba85..000000000
--- a/thrust/system/cuda/detail/cub/cub.cuh
+++ /dev/null
@@ -1,95 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * CUB umbrella include file
- */
-
-#pragma once
-
-
-// Block
-#include "block/block_histogram.cuh"
-#include "block/block_discontinuity.cuh"
-#include "block/block_exchange.cuh"
-#include "block/block_load.cuh"
-#include "block/block_radix_rank.cuh"
-#include "block/block_radix_sort.cuh"
-#include "block/block_reduce.cuh"
-#include "block/block_scan.cuh"
-#include "block/block_store.cuh"
-#include "block/block_shift.cuh"
-
-// Device
-#include "device/device_histogram.cuh"
-#include "device/device_partition.cuh"
-#include "device/device_radix_sort.cuh"
-#include "device/device_reduce.cuh"
-#include "device/device_scan.cuh"
-#include "device/device_select.cuh"
-
-// Grid
-//#include "grid/grid_barrier.cuh"
-#include "grid/grid_even_share.cuh"
-#include "grid/grid_mapping.cuh"
-#include "grid/grid_queue.cuh"
-
-// Host
-#include "host/spinlock.cuh"
-
-// Thread
-#include "thread/thread_load.cuh"
-#include "thread/thread_operators.cuh"
-#include "thread/thread_reduce.cuh"
-#include "thread/thread_scan.cuh"
-#include "thread/thread_store.cuh"
-
-// Warp
-#include "warp/warp_reduce.cuh"
-#include "warp/warp_scan.cuh"
-
-// Iterator
-#include "iterator/arg_index_input_iterator.cuh"
-#include "iterator/cache_modified_input_iterator.cuh"
-#include "iterator/cache_modified_output_iterator.cuh"
-#include "iterator/constant_input_iterator.cuh"
-#include "iterator/counting_input_iterator.cuh"
-#include "iterator/tex_obj_input_iterator.cuh"
-#include "iterator/tex_ref_input_iterator.cuh"
-#include "iterator/transform_input_iterator.cuh"
-
-// Util
-#include "util_allocator.cuh"
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_device.cuh"
-#include "util_macro.cuh"
-#include "util_ptx.cuh"
-#include "util_type.cuh"
-
diff --git a/thrust/system/cuda/detail/cub/device/device_histogram.cuh b/thrust/system/cuda/detail/cub/device/device_histogram.cuh
deleted file mode 100644
index 1ce687e20..000000000
--- a/thrust/system/cuda/detail/cub/device/device_histogram.cuh
+++ /dev/null
@@ -1,653 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_histogram_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory. ![](histogram_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Histogram"><em>histogram</em></a>
- * counts the number of observations that fall into each of the disjoint categories (known as <em>bins</em>).
- *
- * \par Usage Considerations
- * \cdp_class{DeviceHistogram}
- *
- * \par Performance
- *
- * \image html histo_perf.png
- *
- */
-struct DeviceHistogram
-{
-    /******************************************************************//**
-     * \name Single-channel samples
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide histogram using fast block-wide sorting.
-     *
-     * \par
-     * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
-     * - Delivers consistent throughput regardless of sample diversity
-     * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a 8-bin histogram of
-     * single-channel <tt>unsigned char</tt> samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histogram
-     * int              num_samples;    // e.g., 12
-     * unsigned char    *d_samples;     // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
-     * unsigned int     *d_histogram;   // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histogram
-     * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [2, 1, 3, 1, 0, 1, 2, 2]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SingleChannelSorting(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Input samples
-        HistoCounter*       d_histogram,                        ///< [out] Array of BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Number of samples to process
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-                DEVICE_HISTO_SORT,
-                BINS,
-                1,
-                1,
-                InputIterator,
-                HistoCounter,
-                Offset>
-            DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            &d_histogram,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide histogram using shared-memory atomic read-modify-write operations.
-     *
-     * \par
-     * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
-     * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a 8-bin histogram of
-     * single-channel <tt>unsigned char</tt> samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histogram
-     * int              num_samples;    // e.g., 12
-     * unsigned char    *d_samples;     // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
-     * unsigned int     *d_histogram;   // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histogram
-     * cub::DeviceHistogram::SingleChannelSharedAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [2, 1, 3, 1, 0, 1, 2, 2]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SingleChannelSharedAtomic(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Input samples
-        HistoCounter*       d_histogram,                        ///< [out] Array of BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Number of samples to process
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-                DEVICE_HISTO_SHARED_ATOMIC,
-                BINS,
-                1,
-                1,
-                InputIterator,
-                HistoCounter,
-                Offset>
-            DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            &d_histogram,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide histogram using global-memory atomic read-modify-write operations.
-     *
-     * \par
-     * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
-     * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of a 8-bin histogram of
-     * single-channel <tt>unsigned char</tt> samples.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histogram
-     * int              num_samples;    // e.g., 12
-     * unsigned char    *d_samples;     // e.g., [2, 6, 7, 5, 3, 0, 2, 1, 7, 0, 6, 2]
-     * unsigned int     *d_histogram;   // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::SingleChannelSorting<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histogram
-     * cub::DeviceHistogram::SingleChannelGlobalAtomic<8>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histogram, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [2, 1, 3, 1, 0, 1, 2, 2]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SingleChannelGlobalAtomic(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Input samples
-        HistoCounter*       d_histogram,                        ///< [out] Array of BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Number of samples to process
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-                DEVICE_HISTO_GLOBAL_ATOMIC,
-                BINS,
-                1,
-                1,
-                InputIterator,
-                HistoCounter,
-                Offset>
-            DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            &d_histogram,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Interleaved multi-channel samples
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide histogram from multi-channel data using fast block-sorting.
-     *
-     * \par
-     * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
-     * - Delivers consistent throughput regardless of sample diversity
-     * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin histograms from
-     * an input sequence of quad-channel (interleaved) <tt>unsigned char</tt> samples.
-     * (E.g., RGB histograms from RGBA pixel samples.)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histograms
-     * int           num_samples;     // e.g., 20 (five pixels with four channels each)
-     * unsigned char *d_samples;      // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
-     *                                //        (0, 6, 7, 5), (3, 0, 2, 6)]
-     * unsigned int  *d_histogram[3]; // e.g., [ [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ] ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiChannelSorting<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1];
-     * //                     [0, 3, 0, 0, 0, 0, 2, 0];
-     * //                     [0, 0, 2, 0, 0, 0, 1, 2] ]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        int                 CHANNELS,
-        int                 ACTIVE_CHANNELS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiChannelSorting(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
-        HistoCounter        *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Total number of samples to process in all channels, including non-active channels
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-            DEVICE_HISTO_SORT,
-            BINS,
-            CHANNELS,
-            ACTIVE_CHANNELS,
-            InputIterator,
-            HistoCounter,
-            Offset> DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histograms,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide histogram from multi-channel data using shared-memory atomic read-modify-write operations.
-     *
-     * \par
-     * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
-     * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
-     * - Histograms having a large number of bins (e.g., thousands) may adversely affect shared memory occupancy and performance (or even the ability to launch).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin histograms from
-     * an input sequence of quad-channel (interleaved) <tt>unsigned char</tt> samples.
-     * (E.g., RGB histograms from RGBA pixel samples.)
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histograms
-     * int           num_samples;     // e.g., 20 (five pixels with four channels each)
-     * unsigned char *d_samples;      // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
-     *                                //        (0, 6, 7, 5), (3, 0, 2, 6)]
-     * unsigned int  *d_histogram[3]; // e.g., [ [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ] ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiChannelSharedAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1];
-     * //                     [0, 3, 0, 0, 0, 0, 2, 0];
-     * //                     [0, 0, 2, 0, 0, 0, 1, 2] ]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        int                 CHANNELS,
-        int                 ACTIVE_CHANNELS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiChannelSharedAtomic(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
-        HistoCounter        *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Total number of samples to process in all channels, including non-active channels
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-            DEVICE_HISTO_SHARED_ATOMIC,
-            BINS,
-            CHANNELS,
-            ACTIVE_CHANNELS,
-            InputIterator,
-            HistoCounter,
-            Offset> DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histograms,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide histogram from multi-channel data using global-memory atomic read-modify-write operations.
-     *
-     * \par
-     * - The total number of samples across all channels (\p num_samples) must be a whole multiple of \p CHANNELS.
-     * - Input samples having lower diversity can cause performance to be degraded due to serializations from bin-collisions.
-     * - Performance is not significantly impacted when computing histograms having large numbers of bins (e.g., thousands).
-     * - Performance is often improved when referencing input samples through a texture-caching iterator (e.g., cub::TexObjInputIterator).
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the computation of three 256-bin histograms from
-     * an input sequence of quad-channel (interleaved) <tt>unsigned char</tt> samples.
-     * (E.g., RGB histograms from RGBA pixel samples.)
-     *
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_histogram.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and histograms
-     * int           num_samples;     // e.g., 20 (five pixels with four channels each)
-     * unsigned char *d_samples;      // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2),
-     *                                //        (0, 6, 7, 5), (3, 0, 2, 6)]
-     * unsigned int  *d_histogram[3]; // e.g., [ [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ];
-     *                                //         [ ,  ,  ,  ,  ,  ,  ,  ] ]
-     * ...
-     *
-     * // Wrap d_samples device pointer in a random-access texture iterator
-     * cub::TexObjInputIterator<unsigned char> d_samples_tex_itr;
-     * d_samples_tex_itr.BindTexture(d_samples, num_samples * sizeof(unsigned char));
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Compute histograms
-     * cub::DeviceHistogram::MultiChannelGlobalAtomic<8, 4, 3>(d_temp_storage, temp_storage_bytes, d_samples_tex_itr, d_histograms, num_samples);
-     *
-     * // Unbind texture iterator
-     * d_samples_tex_itr.UnbindTexture();
-     *
-     * // d_histogram   <-- [ [1, 0, 1, 2, 0, 0, 0, 1];
-     * //                     [0, 3, 0, 0, 0, 0, 2, 0];
-     * //                     [0, 0, 2, 0, 0, 0, 1, 2] ]
-     *
-     * \endcode
-     *
-     * \tparam BINS                 Number of histogram bins per channel
-     * \tparam CHANNELS             Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-     * \tparam ACTIVE_CHANNELS      <b>[inferred]</b> Number of channels actively being histogrammed
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input samples. (Must have an InputIterator::value_type that, when cast as an integer, falls in the range [0..BINS-1])  \iterator
-     * \tparam HistoCounter         <b>[inferred]</b> Integer type for counting sample occurrences per histogram bin
-     */
-    template <
-        int                 BINS,
-        int                 CHANNELS,
-        int                 ACTIVE_CHANNELS,
-        typename            InputIterator,
-        typename            HistoCounter>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t MultiChannelGlobalAtomic(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
-        HistoCounter        *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of active channel histogram pointers, each pointing to an output array having BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Total number of samples to process in all channels, including non-active channels
-        cudaStream_t        stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceHistogramDispatch<
-                DEVICE_HISTO_GLOBAL_ATOMIC,
-                BINS,
-                CHANNELS,
-                ACTIVE_CHANNELS,
-                InputIterator,
-                HistoCounter,
-                Offset>
-            DeviceHistogramDispatch;
-
-        return DeviceHistogramDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_samples,
-            d_histograms,
-            num_samples,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_device_histogram.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_partition.cuh b/thrust/system/cuda/detail/cub/device/device_partition.cuh
deleted file mode 100644
index 4a4be1f68..000000000
--- a/thrust/system/cuda/detail/cub/device/device_partition.cuh
+++ /dev/null
@@ -1,275 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_select_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within global memory. ![](partition_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from
- * a specified input sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DevicePartition}
- *
- * \par Performance
- * \linear_performance{partition}
- *
- * \par
- * The following chart illustrates DevicePartition::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected for the first partition.
- * \plots_below
- *
- * \image html partition_if_int32_50_percent.png
- *
- */
-struct DevicePartition
-{
-    /**
-     * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7, 8, 5, 3, 2]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIterator       <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIterator  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    FlagIterator,
-        typename                    OutputIterator,
-        typename                    NumSelectedIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     Offset;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out.  The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original
-     *   relative ordering, however copies of the unselected items are compacted into the
-     *   rear of \p d_out in reverse order.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated partition-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected for the first partition with 50% probability.
-     *
-     * \image html partition_if_int32_50_percent.png
-     * \image html partition_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability for the first partition:
-     *
-     * \image html partition_if_int32_5_percent.png
-     * \image html partition_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_partition.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2, 8, 81, 9]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator       <b>[inferred]</b> Random-access output iterator type for writing output items \iterator
-     * \tparam NumSelectedIterator  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection functor type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator,
-        typename                    NumSelectedIterator,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of partitioned data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition)
-        int                         num_items,                      ///< [in] Total number of items to select from
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     Offset;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // Flag iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, true>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_partition_flagged.cu
- * \example example_device_partition_if.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh b/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
deleted file mode 100644
index 384831aa1..000000000
--- a/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh
+++ /dev/null
@@ -1,420 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_radix_sort_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory. ![](sorting_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * The [<em>radix sorting method</em>](http://en.wikipedia.org/wiki/Radix_sort) arranges
- * items into ascending order.  It relies upon a positional representation for
- * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits,
- * characters, etc.) specified from least-significant to most-significant.  For a
- * given input sequence of keys and a set of rules specifying a total ordering
- * of the symbolic alphabet, the radix sorting method produces a lexicographic
- * ordering of those keys.
- *
- * \par
- * DeviceRadixSort can sort all of the built-in C++ numeric primitive types, e.g.:
- * <tt>unsigned char</tt>, \p int, \p double, etc.  Although the direct radix sorting
- * method can only be applied to unsigned integral types, BlockRadixSort
- * is able to sort signed and floating-point types via simple bit-wise transformations
- * that ensure lexicographic key ordering.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRadixSort}
- *
- * \par Performance
- * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys
- * performance across different CUDA architectures for uniform-random \p uint32 keys.
- * \plots_below
- *
- * \image html lsb_radix_sort_int32_keys.png
- *
- */
-struct DeviceRadixSort
-{
-    /**
-     * \brief Sorts key-value pairs into ascending order.
-     *
-     * \par
-     * - The sorting operation requires a pair of key buffers and a pair of value
-     *   buffers.  Each pair is wrapped in a DoubleBuffer structure whose member
-     *   DoubleBuffer::Current() references the active buffer.  The currently-active
-     *   buffer may be changed by the sorting operation.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random <tt>uint32,uint32</tt> and
-     * <tt>uint64,uint64</tt> pairs, respectively.
-     *
-     * \image html lsb_radix_sort_int32_pairs.png
-     * \image html lsb_radix_sort_int64_pairs.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     * // d_values.Current()    <-- [5, 4, 3, 1, 2, 0, 6]
-     *
-     * \endcode
-     *
-     * \tparam Key      <b>[inferred]</b> Key type
-     * \tparam Value    <b>[inferred]</b> Value type
-     */
-    template <
-        typename            Key,
-        typename            Value>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairs(
-        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<Value> &d_values,                              ///< [in,out] Double-buffer of values whose "current" buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                 num_items,                              ///< [in] Number of items to reduce
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
-        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        return DeviceRadixSortDispatch<false, Key, Value, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts key-value pairs into descending order.
-     *
-     * \par
-     * - The sorting operation requires a pair of key buffers and a pair of value
-     *   buffers.  Each pair is wrapped in a DoubleBuffer structure whose member
-     *   DoubleBuffer::Current() references the active buffer.  The currently-active
-     *   buffer may be changed by the sorting operation.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortPairs.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys
-     * with associated vector of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * int  *d_value_buf;       // e.g., [0, 1, 2, 3, 4, 5, 6]
-     * int  *d_value_alt_buf;   // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a set of DoubleBuffers to wrap pairs of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     * cub::DoubleBuffer<int> d_values(d_value_buf, d_value_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     * // d_values.Current()    <-- [6, 0, 2, 1, 3, 4, 5]
-     *
-     * \endcode
-     *
-     * \tparam Key      <b>[inferred]</b> Key type
-     * \tparam Value    <b>[inferred]</b> Value type
-     */
-    template <
-        typename            Key,
-        typename            Value>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortPairsDescending(
-        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<Value> &d_values,                              ///< [in,out] Double-buffer of values whose "current" buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        int                 num_items,                              ///< [in] Number of items to reduce
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
-        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        return DeviceRadixSortDispatch<true, Key, Value, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into ascending order
-     *
-     * \par
-     * - The sorting operation requires a pair of key buffers.  The pair is
-     *   wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current()
-     *   references the active buffer.  The currently-active buffer may be changed
-     *   by the sorting operation.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated sorting performance across different
-     * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively.
-     *
-     * \image html lsb_radix_sort_int32_keys.png
-     * \image html lsb_radix_sort_int64_keys.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [0, 3, 5, 6, 7, 8, 9]
-     *
-     * \endcode
-     *
-     * \tparam Key      <b>[inferred]</b> Key type
-     */
-    template <typename Key>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeys(
-        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to reduce
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
-        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DeviceRadixSortDispatch<false, Key, NullType, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Sorts keys into ascending order
-     *
-     * \par
-     * - The sorting operation requires a pair of key buffers.  The pair is
-     *   wrapped in a DoubleBuffer structure whose member DoubleBuffer::Current()
-     *   references the active buffer.  The currently-active buffer may be changed
-     *   by the sorting operation.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is similar to DeviceRadixSort::SortKeys.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sorting of a device vector of \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for sorting data
-     * int  num_items;          // e.g., 7
-     * int  *d_key_buf;         // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_key_alt_buf;     // e.g., [        ...        ]
-     * ...
-     *
-     * // Create a DoubleBuffer to wrap the pair of device pointers
-     * cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sorting operation
-     * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items);
-     *
-     * // d_keys.Current()      <-- [9, 8, 7, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam Key      <b>[inferred]</b> Key type
-     */
-    template <typename Key>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t SortKeysDescending(
-        void                *d_temp_storage,                        ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                    ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>   &d_keys,                                ///< [in,out] Reference to the double-buffer of keys whose "current" buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        int                 num_items,                              ///< [in] Number of items to reduce
-        int                 begin_bit           = 0,                ///< [in] <b>[optional]</b> The first (least-significant) bit index needed for key comparison
-        int                 end_bit             = sizeof(Key) * 8,  ///< [in] <b>[optional]</b> The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t        stream              = 0,                ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous   = false)            ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Null value type
-        DoubleBuffer<NullType> d_values;
-
-        return DeviceRadixSortDispatch<true, Key, NullType, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_radix_sort.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_reduce.cuh b/thrust/system/cuda/detail/cub/device/device_reduce.cuh
deleted file mode 100644
index 3c20cec5d..000000000
--- a/thrust/system/cuda/detail/cub/device/device_reduce.cuh
+++ /dev/null
@@ -1,684 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_reduce_dispatch.cuh"
-#include "dispatch/device_reduce_by_key_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory. ![](reduce_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- * uses a binary combining operator to compute a single aggregate from a sequence of input elements.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceReduce}
- *
- * \par Performance
- * \linear_performance{reduction, reduce-by-key, and run-length encode}
- *
- * \par
- * The following chart illustrates DeviceReduce::Sum
- * performance across different CUDA architectures for \p int32 keys.
- *
- * \image html reduce_int32.png
- *
- * \par
- * The following chart illustrates DeviceReduce::ReduceByKey (summation)
- * performance across different CUDA architectures for \p fp32
- * values.  Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
- *
- * \image html reduce_by_key_fp32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceReduce
-{
-    /**
-     * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor.
-     *
-     * \par
-     * - Does not support non-commutative reduction operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceReduce::Sum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates a custom min reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;  // e.g., 7
-     * int          *d_in;      // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;     // e.g., [ ]
-     * CustomMin    min_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduction
-     * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     * \tparam ReductionOp        <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator,
-        typename                    ReductionOp>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Reduce(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOp                 reduction_op,                       ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, ReductionOp> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            reduction_op,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide sum using the addition ('+') operator.
-     *
-     * \par
-     * - Does not support non-commutative reduction operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated reduction (sum) performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html reduce_int32.png
-     * \image html reduce_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the sum reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run sum-reduction
-     * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_sum, num_items);
-     *
-     * // d_out <-- [38]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Sum(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Sum> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Sum(),
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide minimum using the less-than ('<') operator.
-     *
-     * \par
-     * - Does not support non-commutative minimum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceReduce::Sum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the min-reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run min-reduction
-     * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
-     *
-     * // d_out <-- [0]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Min(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Min> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Min(),
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item.
-     *
-     * \par
-     * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
-     * <tt>ItemOffsetPair<T, int></tt>.  The minimum value is written to <tt>d_out.value</tt> and its
-     * location in the input array is written to <tt>d_out.offset</tt>.
-     *
-     * \par
-     * - Does not support non-commutative minimum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceReduce::Sum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmin-reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * ItemOffsetPair<int, int> *d_out;         // e.g., [{ , }]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmin-reduction
-     * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items);
-     *
-     * // d_out <-- [{0, 5}]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>ItemOffsetPair<T, int></tt>) \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMin(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Wrapped input iterator
-        typedef ArgIndexInputIterator<InputIterator, int> ArgIndexInputIterator;
-        ArgIndexInputIterator d_argmin_in(d_in, 0);
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMin> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_argmin_in,
-            d_out,
-            num_items,
-            cub::ArgMin(),
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide maximum using the greater-than ('>') operator.
-     *
-     * \par
-     * - Does not support non-commutative maximum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceReduce::Sum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the max-reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_radix_sort.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run max-reduction
-     * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
-     *
-     * // d_out <-- [9]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t Max(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<InputIterator, OutputIterator, Offset, cub::Max> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            num_items,
-            cub::Max(),
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item
-     *
-     * \par
-     * Assuming the input \p d_in has value type \p T, the output \p d_out must have value type
-     * <tt>ItemOffsetPair<T, int></tt>.  The maximum value is written to <tt>d_out.value</tt> and its
-     * location in the input array is written to <tt>d_out.offset</tt>.
-     *
-     * \par
-     * - Does not support non-commutative maximum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceReduce::Sum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the argmax-reduction of a device vector of \p int items.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int                      num_items;      // e.g., 7
-     * int                      *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * ItemOffsetPair<int, int> *d_out;         // e.g., [{ , }]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run argmax-reduction
-     * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items);
-     *
-     * // d_out <-- [{9, 6}]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading input items (of some type \p T) \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Output iterator type for recording the reduced aggregate (having value type <tt>ItemOffsetPair<T, int></tt>) \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ArgMax(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        int                         num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Wrapped input iterator
-        typedef ArgIndexInputIterator<InputIterator, int> ArgIndexInputIterator;
-        ArgIndexInputIterator d_argmax_in(d_in, 0);
-
-        // Dispatch type
-        typedef DeviceReduceDispatch<ArgIndexInputIterator, OutputIterator, Offset, cub::ArgMax> DeviceReduceDispatch;
-
-        return DeviceReduceDispatch::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_argmax_in,
-            d_out,
-            num_items,
-            cub::ArgMax(),
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys.
-     *
-     * \par
-     * This operation computes segmented reductions within \p d_values_in using
-     * the specified binary \p reduction_op functor.  The segments are identified by
-     * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of
-     * consecutive, identical keys.  For the <em>i</em><sup>th</sup> run encountered,
-     * the first key of the run and the corresponding value aggregate of that run are
-     * written to <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_aggregates_out[<em>i</em>]</tt>,
-     * respectively. The total number of runs encountered is written to \p d_num_runs_out.
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following chart illustrates reduction-by-key (sum) performance across
-     * different CUDA architectures for \p fp32 and \p fp64 values, respectively.  Segments
-     * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000].
-     *
-     * \image html reduce_by_key_fp32_len_500.png
-     * \image html reduce_by_key_fp64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html reduce_by_key_fp32_len_5.png
-     * \image html reduce_by_key_fp64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the segmented reduction of \p int values grouped
-     * by runs of associated \p int keys.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_reduce.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_keys_in;         // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_values_in;       // e.g., [0, 7, 1, 6, 2, 5, 3, 4]
-     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_aggregates_out;  // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;        // e.g., [ ]
-     * CustomMin    reduction_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run reduce-by-key
-     * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_aggregates_out  <-- [0, 1, 6, 2, 4]
-     * // d_num_runs_out        <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam KeysInputIterator        <b>[inferred]</b> Random-access input iterator type for reading input keys \iterator
-     * \tparam UniqueOutputIterator     <b>[inferred]</b> Random-access output iterator type for writing unique output keys \iterator
-     * \tparam ValuesInputIterator      <b>[inferred]</b> Random-access input iterator type for reading input values \iterator
-     * \tparam AggregatesOutputIterator <b>[inferred]</b> Random-access output iterator type for writing output value aggregates \iterator
-     * \tparam NumRunsOutputIterator    <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     * \tparam ReductionOp              <b>[inferred]</b> Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename                    KeysInputIterator,
-        typename                    UniqueOutputIterator,
-        typename                    ValuesInputIterator,
-        typename                    AggregatesOutputIterator,
-        typename                    NumRunsOutputIterator,
-        typename                    ReductionOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t ReduceByKey(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIterator           d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIterator        d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIterator         d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIterator    d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        ReductionOp                 reduction_op,                   ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                 Offset;         // Signed integer type for global offsets
-        typedef NullType*           FlagIterator;   // Flag iterator type (not used)
-        typedef NullType            SelectOp;       // Selection op (not used)
-        typedef Equality            EqualityOp;     // Default == operator
-
-        return DeviceReduceByKeyDispatch<KeysInputIterator, UniqueOutputIterator, ValuesInputIterator, AggregatesOutputIterator, NumRunsOutputIterator, EqualityOp, ReductionOp, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys_in,
-            d_unique_out,
-            d_values_in,
-            d_aggregates_out,
-            d_num_runs_out,
-            EqualityOp(),
-            reduction_op,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_reduce.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh b/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
deleted file mode 100644
index 0bd4b47ff..000000000
--- a/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh
+++ /dev/null
@@ -1,281 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_rle_dispatch.cuh"
-#include "dispatch/device_reduce_by_key_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within global memory. ![](run_length_encode_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * A <a href="http://en.wikipedia.org/wiki/Run-length_encoding"><em>run-length encoding</em></a>
- * computes a simple compressed representation of a sequence of input elements such that each
- * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a
- * count of the elements in that run.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceRunLengthEncode}
- *
- * \par Performance
- * \linear_performance{run-length encode}
- *
- * \par
- * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across
- * different CUDA architectures for \p int32 items.
- * Segments have lengths uniformly sampled from [1,1000].
- *
- * \image html rle_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceRunLengthEncode
-{
-
-    /**
-     * \brief Computes a run-length encoding of the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> run encountered, the first key of the run and its length are written to
-     *   <tt>d_unique_out[<em>i</em>]</tt> and <tt>d_counts_out[<em>i</em>]</tt>,
-     *   respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated encode performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html rle_int32_len_500.png
-     * \image html rle_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html rle_int32_len_5.png
-     * \image html rle_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the run-length encoding of a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_unique_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_counts_out;      // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
-     *
-     * // d_unique_out      <-- [0, 2, 9, 5, 8]
-     * // d_counts_out      <-- [1, 2, 1, 3, 1]
-     * // d_num_runs_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator            <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam UniqueOutputIterator     <b>[inferred]</b> Random-access output iterator type for writing unique output items \iterator
-     * \tparam LengthsOutputIterator    <b>[inferred]</b> Random-access output iterator type for writing output counts \iterator
-     * \tparam NumRunsOutputIterator    <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    UniqueOutputIterator,
-        typename                    LengthsOutputIterator,
-        typename                    NumRunsOutputIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Encode(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIterator        d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        LengthsOutputIterator       d_counts_out,                   ///< [out] Pointer to the output sequence of run-lengths (one count per run)
-        NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to total number of runs
-        int                         num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Data type of value iterator
-        typedef typename std::iterator_traits<LengthsOutputIterator>::value_type Value;
-
-        typedef int         Offset;                     // Signed integer type for global offsets
-        typedef NullType*   FlagIterator;               // Flag iterator type (not used)
-        typedef NullType    SelectOp;                   // Selection op (not used)
-        typedef Equality    EqualityOp;                 // Default == operator
-        typedef cub::Sum    ReductionOp;                // Value reduction operator
-
-        // Generator type for providing 1s values for run-length reduction
-        typedef ConstantInputIterator<Value, Offset> LengthsInputIterator;
-
-        Value one_val;
-        one_val = 1;
-
-        return DeviceReduceByKeyDispatch<InputIterator, UniqueOutputIterator, LengthsInputIterator, LengthsOutputIterator, NumRunsOutputIterator, EqualityOp, ReductionOp, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_unique_out,
-            LengthsInputIterator(one_val),
-            d_counts_out,
-            d_num_runs_out,
-            EqualityOp(),
-            ReductionOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in.
-     *
-     * \par
-     * - For the <em>i</em><sup>th</sup> non-trivial run, the run's starting offset
-     *   and its length are written to <tt>d_offsets_out[<em>i</em>]</tt> and
-     *   <tt>d_lengths_out[<em>i</em>]</tt>, respectively.
-     * - The total number of runs encountered is written to \p d_num_runs_out.
-     * - The <tt>==</tt> equality operator is used to determine whether values are equivalent
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     *
-     * \par Snippet
-     * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_run_length_encode.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;          // e.g., 8
-     * int          *d_in;              // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int          *d_offsets_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_lengths_out;     // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int          *d_num_runs_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run encoding
-     * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
-     *
-     * // d_offsets_out         <-- [1, 4]
-     * // d_lengths_out         <-- [2, 3]
-     * // d_num_runs_out        <-- [2]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator            <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OffsetsOutputIterator    <b>[inferred]</b> Random-access output iterator type for writing run-offset values \iterator
-     * \tparam LengthsOutputIterator    <b>[inferred]</b> Random-access output iterator type for writing run-length values \iterator
-     * \tparam NumRunsOutputIterator    <b>[inferred]</b> Output iterator type for recording the number of runs encountered \iterator
-     */
-    template <
-        typename                InputIterator,
-        typename                OffsetsOutputIterator,
-        typename                LengthsOutputIterator,
-        typename                NumRunsOutputIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t NonTrivialRuns(
-        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator           d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIterator   d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run)
-        LengthsOutputIterator   d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run)
-        NumRunsOutputIterator   d_num_runs_out,                 ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        int                     num_items,                      ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values)
-        cudaStream_t            stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int         Offset;                     // Signed integer type for global offsets
-        typedef Equality    EqualityOp;                 // Default == operator
-
-        return DeviceRleDispatch<InputIterator, OffsetsOutputIterator, LengthsOutputIterator, NumRunsOutputIterator, EqualityOp, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_offsets_out,
-            d_lengths_out,
-            d_num_runs_out,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_scan.cuh b/thrust/system/cuda/detail/cub/device/device_scan.cuh
deleted file mode 100644
index 2509e523b..000000000
--- a/thrust/system/cuda/detail/cub/device/device_scan.cuh
+++ /dev/null
@@ -1,419 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_scan_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory. ![](device_scan.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * Given a sequence of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- * produces an output sequence where each element is computed to be the reduction
- * of the elements occurring earlier in the input sequence.  <em>Prefix sum</em>
- * connotes a prefix scan with the addition operator. The term \em inclusive indicates
- * that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- * The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- * the <em>i</em><sup>th</sup> output reduction.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceScan}
- *
- * \par Performance
- * \linear_performance{prefix scan}
- *
- * \par
- * The following chart illustrates DeviceScan::ExclusiveSum
- * performance across different CUDA architectures for \p int32 keys.
- * \plots_below
- *
- * \image html scan_int32.png
- *
- */
-struct DeviceScan
-{
-    /******************************************************************//**
-     * \name Exclusive scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a device-wide exclusive prefix sum.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated exclusive sum performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.
-     *
-     * \image html scan_int32.png
-     * \image html scan_int64.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix sum
-     * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out s<-- [0, 8, 14, 21, 26, 29, 29]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename        InputIterator,
-        typename        OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveSum(
-        void            *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator   d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator  d_out,                              ///< [out] Pointer to the output sequence of data items
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        // Scan data type
-        typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-        return DeviceScanDispatch<InputIterator, OutputIterator, Sum, T, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            T(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceScan::ExclusiveSum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op
-     * ...
-     *
-     * // Determine temporary device storage requirements for exclusive prefix scan
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // Allocate temporary storage for exclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run exclusive prefix min-scan
-     * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items);
-     *
-     * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator    <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIterator   <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     * \tparam Identity         <b>[inferred]</b> Type of the \p identity value used Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIterator,
-        typename        OutputIterator,
-        typename        ScanOp,
-        typename        Identity>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t ExclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator   d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator  d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOp          scan_op,                            ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        Identity        identity,                           ///< [in] Identity element
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream              = 0,            ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous   = false)        ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        return DeviceScanDispatch<InputIterator, OutputIterator, ScanOp, Identity, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            identity,
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix sum.
-     *
-     * \par
-     * - Supports non-commutative sum operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceScan::ExclusiveSum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix sum of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;      // e.g., 7
-     * int  *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int  *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix sum
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix sum
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix sum
-     * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
-     *
-     * // d_out <-- [8, 14, 21, 26, 29, 29, 38]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator      <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIterator     <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     */
-    template <
-        typename            InputIterator,
-        typename            OutputIterator>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveSum(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator      d_out,                              ///< [out] Pointer to the output sequence of data items
-        int                 num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t        stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        return DeviceScanDispatch<InputIterator, OutputIterator, Sum, NullType, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            Sum(),
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor.
-     *
-     * \par
-     * - Supports non-commutative scan operators.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * Performance is typically similar to DeviceScan::ExclusiveSum.
-     *
-     * \par Snippet
-     * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_scan.cuh>
-     *
-     * // CustomMin functor
-     * struct CustomMin
-     * {
-     *     template <typename T>
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     T operator()(const T &a, const T &b) const {
-     *         return (b < a) ? b : a;
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int          num_items;      // e.g., 7
-     * int          *d_in;          // e.g., [8, 6, 7, 5, 3, 0, 9]
-     * int          *d_out;         // e.g., [ ,  ,  ,  ,  ,  ,  ]
-     * CustomMin    min_op;
-     * ...
-     *
-     * // Determine temporary device storage requirements for inclusive prefix scan
-     * void *d_temp_storage = NULL;
-     * size_t temp_storage_bytes = 0;
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // Allocate temporary storage for inclusive prefix scan
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run inclusive prefix min-scan
-     * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items);
-     *
-     * // d_out <-- [8, 6, 6, 5, 3, 0, 0]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator    <b>[inferred]</b> Random-access input iterator type for reading scan inputs \iterator
-     * \tparam OutputIterator   <b>[inferred]</b> Random-access output iterator type for writing scan outputs \iterator
-     * \tparam ScanOp           <b>[inferred]</b> Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename        InputIterator,
-        typename        OutputIterator,
-        typename        ScanOp>
-    CUB_RUNTIME_FUNCTION
-    static cudaError_t InclusiveScan(
-        void            *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator   d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator  d_out,                              ///< [out] Pointer to the output sequence of data items
-        ScanOp          scan_op,                            ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        int             num_items,                          ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream             = 0,             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous  = false)         ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        // Signed integer type for global offsets
-        typedef int Offset;
-
-        return DeviceScanDispatch<InputIterator, OutputIterator, ScanOp, NullType, Offset>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_out,
-            scan_op,
-            NullType(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-    //@}  end member group
-
-};
-
-/**
- * \example example_device_scan.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/device_select.cuh b/thrust/system/cuda/detail/cub/device/device_select.cuh
deleted file mode 100644
index 8ad409046..000000000
--- a/thrust/system/cuda/detail/cub/device/device_select.cuh
+++ /dev/null
@@ -1,372 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "dispatch/device_select_dispatch.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within global memory. ![](select_logo.png)
- * \ingroup DeviceModule
- *
- * \par Overview
- * These operations apply a selection criterion to selectively copy
- * items from a specified input sequence to a compact output sequence.
- *
- * \par Usage Considerations
- * \cdp_class{DeviceSelect}
- *
- * \par Performance
- * \linear_performance{select-flagged, select-if, and select-unique}
- *
- * \par
- * The following chart illustrates DeviceSelect::If
- * performance across different CUDA architectures for \p int32 items,
- * where 50% of the items are randomly selected.
- *
- * \image html select_if_int32_50_percent.png
- *
- * \par
- * The following chart illustrates DeviceSelect::Unique
- * performance across different CUDA architectures for \p int32 items
- * where segments have lengths uniformly sampled from [1,1000].
- *
- * \image html select_unique_int32_len_500.png
- *
- * \par
- * \plots_below
- *
- */
-struct DeviceSelect
-{
-    /**
-     * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png)
-     *
-     * \par
-     * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.).
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input, flags, and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
-     * char *d_flags;               // e.g., [1, 0, 0, 1, 0, 1, 1, 0]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [1, 4, 6, 7]
-     * // d_num_selected_out    <-- [4]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam FlagIterator         <b>[inferred]</b> Random-access input iterator type for reading selection flags \iterator
-     * \tparam OutputIterator       <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIterator  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    FlagIterator,
-        typename                    OutputIterator,
-        typename                    NumSelectedIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Flagged(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagIterator                d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     Offset;         // Signed integer type for global offsets
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            d_flags,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png)
-     *
-     * \par
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-if performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Items are
-     * selected with 50% probability.
-     *
-     * \image html select_if_int32_50_percent.png
-     * \image html select_if_int64_50_percent.png
-     *
-     * \par
-     * The following charts are similar, but 5% selection probability:
-     *
-     * \image html select_if_int32_5_percent.png
-     * \image html select_if_int64_5_percent.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>   // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Functor type for selecting values less than some criteria
-     * struct LessThan
-     * {
-     *     int compare;
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     LessThan(int compare) : compare(compare) {}
-     *
-     *     CUB_RUNTIME_FUNCTION __forceinline__
-     *     bool operator()(const int &a) const {
-     *         return (a < compare);
-     *     }
-     * };
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int      num_items;              // e.g., 8
-     * int      *d_in;                  // e.g., [0, 2, 3, 9, 5, 2, 81, 8]
-     * int      *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int      *d_num_selected_out;    // e.g., [ ]
-     * LessThan select_op(7);
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op);
-     *
-     * // d_out                 <-- [0, 2, 3, 5, 2]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator       <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIterator  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     * \tparam SelectOp             <b>[inferred]</b> Selection operator type having member <tt>bool operator()(const T &a)</tt>
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator,
-        typename                    NumSelectedIterator,
-        typename                    SelectOp>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t If(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        SelectOp                    select_op,                      ///< [in] Unary selection operator
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     Offset;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // Flag iterator type (not used)
-        typedef NullType                EqualityOp;     // Equality operator (not used)
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            select_op,
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-
-    /**
-     * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out.  The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png)
-     *
-     * \par
-     * - The <tt>==</tt> equality operator is used to determine whether keys are equivalent
-     * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering.
-     * - \devicestorage
-     * - \cdp
-     *
-     * \par Performance
-     * The following charts illustrate saturated select-unique performance across different
-     * CUDA architectures for \p int32 and \p int64 items, respectively.  Segments have
-     * lengths uniformly sampled from [1,1000].
-     *
-     * \image html select_unique_int32_len_500.png
-     * \image html select_unique_int64_len_500.png
-     *
-     * \par
-     * The following charts are similar, but with segment lengths uniformly sampled from [1,10]:
-     *
-     * \image html select_unique_int32_len_5.png
-     * \image html select_unique_int64_len_5.png
-     *
-     * \par Snippet
-     * The code snippet below illustrates the compaction of items selected from an \p int device vector.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>       // or equivalently <cub/device/device_select.cuh>
-     *
-     * // Declare, allocate, and initialize device pointers for input and output
-     * int  num_items;              // e.g., 8
-     * int  *d_in;                  // e.g., [0, 2, 2, 9, 5, 5, 5, 8]
-     * int  *d_out;                 // e.g., [ ,  ,  ,  ,  ,  ,  ,  ]
-     * int  *d_num_selected_out;    // e.g., [ ]
-     * ...
-     *
-     * // Determine temporary device storage requirements
-     * void     *d_temp_storage = NULL;
-     * size_t   temp_storage_bytes = 0;
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // Allocate temporary storage
-     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
-     *
-     * // Run selection
-     * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items);
-     *
-     * // d_out                 <-- [0, 2, 9, 5, 8]
-     * // d_num_selected_out    <-- [5]
-     *
-     * \endcode
-     *
-     * \tparam InputIterator        <b>[inferred]</b> Random-access input iterator type for reading input items \iterator
-     * \tparam OutputIterator       <b>[inferred]</b> Random-access output iterator type for writing selected items \iterator
-     * \tparam NumSelectedIterator  <b>[inferred]</b> Output iterator type for recording the number of items selected \iterator
-     */
-    template <
-        typename                    InputIterator,
-        typename                    OutputIterator,
-        typename                    NumSelectedIterator>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Unique(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of selected data items
-        NumSelectedIterator         d_num_selected_out,             ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out)
-        int                         num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream             = 0,         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous  = false)     ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  May cause significant slowdown.  Default is \p false.
-    {
-        typedef int                     Offset;         // Signed integer type for global offsets
-        typedef NullType*               FlagIterator;   // Flag iterator type (not used)
-        typedef NullType                SelectOp;       // Selection op (not used)
-        typedef Equality                EqualityOp;     // Default == operator
-
-        return DeviceSelectDispatch<InputIterator, FlagIterator, OutputIterator, NumSelectedIterator, SelectOp, EqualityOp, Offset, false>::Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_in,
-            NULL,
-            d_out,
-            d_num_selected_out,
-            SelectOp(),
-            EqualityOp(),
-            num_items,
-            stream,
-            debug_synchronous);
-    }
-
-};
-
-/**
- * \example example_device_select_flagged.cu
- * \example example_device_select_if.cu
- * \example example_device_select_unique.cu
- */
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh
deleted file mode 100644
index 8b7178efe..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_histogram_dispatch.cuh
+++ /dev/null
@@ -1,554 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../block_sweep/block_histogram_sweep.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Initialization kernel entry point (multi-block).  Prepares queue descriptors and zeroes global counters.
- */
-template <
-    int                                             BINS,                   ///< Number of histogram bins per channel
-    int                                             ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
-    typename                                        Offset,                 ///< Signed integer type for global offsets
-    typename                                        HistoCounter>           ///< Integer type for counting sample occurrences per histogram bin
-__launch_bounds__ (BINS, 1)
-__global__ void DeviceHistogramInitKernel(
-    GridQueue<Offset>                               grid_queue,             ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,       ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
-    Offset                                          num_samples)            ///< [in] Total number of samples \p d_samples for all channels
-{
-    d_out_histograms.array[blockIdx.x][threadIdx.x] = 0;
-    if (threadIdx.x == 0) grid_queue.FillAndResetDrain(num_samples);
-}
-
-
-/**
- * Histogram tiles kernel entry point (multi-block).  Computes privatized histograms, one per thread block.
- */
-template <
-    typename                                        BlockHistogramSweepPolicy,  ///< Parameterized BlockHistogramSweepPolicy tuning policy type
-    int                                             BINS,                       ///< Number of histogram bins per channel
-    int                                             CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int                                             ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                                        InputIterator,              ///< The input iterator type \iterator.  Must have a value type that is assignable to <tt>unsigned char</tt>
-    typename                                        HistoCounter,               ///< Integer type for counting sample occurrences per histogram bin
-    typename                                        Offset>                     ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockHistogramSweepPolicy::BLOCK_THREADS))
-__global__ void DeviceHistogramSweepKernel(
-    InputIterator                                   d_samples,                  ///< [in] Array of sample data. The samples from different channels are assumed to be interleaved (e.g., an array of 32b pixels where each pixel consists of four RGBA 8b samples).
-    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,           ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][gridDim.x][BINS]</tt>
-    Offset                                          num_samples,                ///< [in] Total number of samples \p d_samples for all channels
-    GridEvenShare<Offset>                           even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-    GridQueue<Offset>                               queue)                      ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Constants
-    enum
-    {
-        BLOCK_THREADS       = BlockHistogramSweepPolicy::BLOCK_THREADS,
-        ITEMS_PER_THREAD    = BlockHistogramSweepPolicy::ITEMS_PER_THREAD,
-        TILE_SIZE           = BLOCK_THREADS * ITEMS_PER_THREAD,
-    };
-
-    // Thread block type for compositing input tiles
-    typedef BlockHistogramSweep<BlockHistogramSweepPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset> BlockHistogramSweepT;
-
-    // Shared memory for BlockHistogramSweep
-    __shared__ typename BlockHistogramSweepT::TempStorage temp_storage;
-
-    // Consume input tiles
-    BlockHistogramSweepT(temp_storage, d_samples, d_out_histograms.array).ConsumeRange(
-        num_samples,
-        even_share,
-        queue,
-        Int2Type<BlockHistogramSweepPolicy::GRID_MAPPING>());
-}
-
-
-/**
- * Aggregation kernel entry point (single-block).  Aggregates privatized threadblock histograms from a previous multi-block histogram pass.
- */
-template <
-    int                                             BINS,                   ///< Number of histogram bins per channel
-    int                                             ACTIVE_CHANNELS,        ///< Number of channels actively being histogrammed
-    typename                                        HistoCounter>           ///< Integer type for counting sample occurrences per histogram bin
-__launch_bounds__ (BINS, 1)
-__global__ void DeviceHistogramAggregateKernel(
-    HistoCounter*                                   d_block_histograms,     ///< [in] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][num_threadblocks][BINS]</tt>
-    ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS>    d_out_histograms,       ///< [out] Histogram counter data having logical dimensions <tt>HistoCounter[ACTIVE_CHANNELS][BINS]</tt>
-    int                                             num_threadblocks)       ///< [in] Number of threadblock histograms per channel in \p d_block_histograms
-{
-    // Accumulate threadblock-histograms from the channel
-    HistoCounter bin_aggregate = 0;
-
-    int block_offset = blockIdx.x * (num_threadblocks * BINS);
-    int block_end = block_offset + (num_threadblocks * BINS);
-
-#if CUB_PTX_ARCH >= 200
-    #pragma unroll 32
-#endif
-    while (block_offset < block_end)
-    {
-        HistoCounter block_bin_count = d_block_histograms[block_offset + threadIdx.x];
-
-        bin_aggregate += block_bin_count;
-        block_offset += BINS;
-    }
-
-    // Output
-    d_out_histograms.array[blockIdx.x][threadIdx.x] = bin_aggregate;
-}
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram
- */
-template <
-    DeviceHistogramAlgorithm        HISTO_ALGORITHM,            ///< Cooperative histogram algorithm to use
-    int                             BINS,                       ///< Number of histogram bins per channel
-    int                             CHANNELS,                   ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed)
-    int                             ACTIVE_CHANNELS,            ///< Number of channels actively being histogrammed
-    typename                        InputIterator,              ///< The input iterator type \iterator.  Must have a value type that is assignable to <tt>unsigned char</tt>
-    typename                        HistoCounter,               ///< Integer type for counting sample occurrences per histogram bin
-    typename                        Offset>                     ///< Signed integer type for global offsets
-struct DeviceHistogramDispatch
-{
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        // RangeHistoPolicy
-        typedef BlockHistogramSweepPolicy<
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 128 : 256,
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 12 : (30 / ACTIVE_CHANNELS),
-                HISTO_ALGORITHM,
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
-            RangeHistoPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        // RangeHistoPolicy
-        typedef BlockHistogramSweepPolicy<
-                128,
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 20 : (22 / ACTIVE_CHANNELS),
-                HISTO_ALGORITHM,
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? GRID_MAPPING_DYNAMIC : GRID_MAPPING_EVEN_SHARE>
-            RangeHistoPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // RangeHistoPolicy
-        typedef BlockHistogramSweepPolicy<
-                128,
-                (HISTO_ALGORITHM == DEVICE_HISTO_SORT) ? 21 : (23 / ACTIVE_CHANNELS),
-                HISTO_ALGORITHM,
-                GRID_MAPPING_DYNAMIC>
-            RangeHistoPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        // RangeHistoPolicy
-        typedef BlockHistogramSweepPolicy<
-                128,
-                7,
-                DEVICE_HISTO_SORT,        // (use sort regardless because g-atomics are unsupported and s-atomics are perf-useless)
-                GRID_MAPPING_EVEN_SHARE>
-            RangeHistoPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRangeHistoPolicy : PtxPolicy::RangeHistoPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_histogram_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_histogram_sweep_config.template Init<PtxRangeHistoPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_histogram_sweep_config.template Init<typename Policy350::RangeHistoPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_histogram_sweep_config.template Init<typename Policy300::RangeHistoPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_histogram_sweep_config.template Init<typename Policy200::RangeHistoPolicy>();
-        }
-        else
-        {
-            device_histogram_sweep_config.template Init<typename Policy100::RangeHistoPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration
-     */
-    struct KernelConfig
-    {
-        int                             block_threads;
-        int                             items_per_thread;
-        DeviceHistogramAlgorithm        block_algorithm;
-        GridMappingStrategy             grid_mapping;
-
-        template <typename BlockPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockPolicy::ITEMS_PER_THREAD;
-            block_algorithm             = BlockPolicy::HISTO_ALGORITHM;
-            grid_mapping                = BlockPolicy::GRID_MAPPING;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d", block_threads, items_per_thread, block_algorithm, grid_mapping);
-        }
-
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-
-    /**
-     * Internal dispatch routine
-     */
-    template <
-        typename                            InitHistoKernelPtr,                 ///< Function type of cub::DeviceHistogramInitKernel
-        typename                            DeviceHistogramSweepKernelPtr,      ///< Function type of cub::DeviceHistogramSweepKernel
-        typename                            SingleHistogramPartialsKernelPtr>   ///< Function type of cub::DeviceHistogramAggregateKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator                       d_samples,                          ///< [in] Pointer to the input sequence of samples to histogram
-        HistoCounter                        *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
-        Offset                              num_samples,                        ///< [in] Number of samples to process
-        cudaStream_t                        stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                                debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Default is \p false.
-        InitHistoKernelPtr                  init_kernel,                        ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel
-        DeviceHistogramSweepKernelPtr       device_histogram_sweep_kernel,      ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel
-        SingleHistogramPartialsKernelPtr    single_histogram_partials_kernel,   ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramAggregateKernel
-        KernelConfig                        device_histogram_sweep_config)      ///< [in] Dispatch parameters that match the policy that \p device_histogram_sweep_kernel was compiled for
-    {
-    #ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-    #else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get SM occupancy for device_histogram_sweep_kernel
-            int histo_range_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                histo_range_sm_occupancy,
-                sm_version,
-                device_histogram_sweep_kernel,
-                device_histogram_sweep_config.block_threads))) break;
-
-            // Get device occupancy for device_histogram_sweep_kernel
-            int histo_range_occupancy = histo_range_sm_occupancy * sm_count;
-
-            // Get tile size for device_histogram_sweep_kernel
-            int channel_tile_size = device_histogram_sweep_config.block_threads * device_histogram_sweep_config.items_per_thread;
-            int tile_size = channel_tile_size * CHANNELS;
-
-            // Even-share work distribution
-            int subscription_factor = histo_range_sm_occupancy;     // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
-            GridEvenShare<Offset> even_share(
-                num_samples,
-                histo_range_occupancy * subscription_factor,
-                tile_size);
-
-            // Get grid size for device_histogram_sweep_kernel
-            int histo_range_grid_size;
-            switch (device_histogram_sweep_config.grid_mapping)
-            {
-            case GRID_MAPPING_EVEN_SHARE:
-
-                // Work is distributed evenly
-                histo_range_grid_size = even_share.grid_size;
-                break;
-
-            case GRID_MAPPING_DYNAMIC:
-
-                // Work is distributed dynamically
-                int num_tiles               = (num_samples + tile_size - 1) / tile_size;
-                histo_range_grid_size   = (num_tiles < histo_range_occupancy) ?
-                    num_tiles :                     // Not enough to fill the device with threadblocks
-                    histo_range_occupancy;      // Fill the device with threadblocks
-                break;
-            };
-
-            // Temporary storage allocation requirements
-            void* allocations[2];
-            size_t allocation_sizes[2] =
-            {
-                ACTIVE_CHANNELS * histo_range_grid_size * sizeof(HistoCounter) * BINS,      // bytes needed for privatized histograms
-                GridQueue<int>::AllocationSize()                                                // bytes needed for grid queue descriptor
-            };
-
-            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Alias the allocation for the privatized per-block reductions
-            HistoCounter *d_block_histograms = (HistoCounter*) allocations[0];
-
-            // Alias the allocation for the grid queue descriptor
-            GridQueue<Offset> queue(allocations[1]);
-
-            // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_histo_wrapper;
-            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-                d_histo_wrapper.array[CHANNEL] = d_histograms[CHANNEL];
-
-            // Setup array wrapper for temporary histogram channel output (because we can't pass static arrays as kernel parameters)
-            ArrayWrapper<HistoCounter*, ACTIVE_CHANNELS> d_temp_histo_wrapper;
-            for (int CHANNEL = 0; CHANNEL < ACTIVE_CHANNELS; ++CHANNEL)
-                d_temp_histo_wrapper.array[CHANNEL] = d_block_histograms + (CHANNEL * histo_range_grid_size * BINS);
-
-            // Log init_kernel configuration
-            if (debug_synchronous) CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", ACTIVE_CHANNELS, BINS, (long long) stream);
-
-            // Invoke init_kernel to initialize counters and queue descriptor
-            init_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(queue, d_histo_wrapper, num_samples);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Whether we need privatized histograms (i.e., non-global atomics and multi-block)
-            bool privatized_temporaries = (histo_range_grid_size > 1) && (device_histogram_sweep_config.block_algorithm != DEVICE_HISTO_GLOBAL_ATOMIC);
-
-            // Log device_histogram_sweep_kernel configuration
-            if (debug_synchronous) CubLog("Invoking device_histogram_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                histo_range_grid_size, device_histogram_sweep_config.block_threads, (long long) stream, device_histogram_sweep_config.items_per_thread, histo_range_sm_occupancy);
-
-            // Invoke device_histogram_sweep_kernel
-            device_histogram_sweep_kernel<<<histo_range_grid_size, device_histogram_sweep_config.block_threads, 0, stream>>>(
-                d_samples,
-                (privatized_temporaries) ?
-                    d_temp_histo_wrapper :
-                    d_histo_wrapper,
-                num_samples,
-                even_share,
-                queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Aggregate privatized block histograms if necessary
-            if (privatized_temporaries)
-            {
-                // Log single_histogram_partials_kernel configuration
-                if (debug_synchronous) CubLog("Invoking single_histogram_partials_kernel<<<%d, %d, 0, %lld>>>()\n",
-                    ACTIVE_CHANNELS, BINS, (long long) stream);
-
-                // Invoke single_histogram_partials_kernel
-                single_histogram_partials_kernel<<<ACTIVE_CHANNELS, BINS, 0, stream>>>(
-                    d_block_histograms,
-                    d_histo_wrapper,
-                    histo_range_grid_size);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-    #endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t              &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator       d_samples,                          ///< [in] Pointer to the input sequence of samples to histogram
-        HistoCounter        *d_histograms[ACTIVE_CHANNELS],     ///< [out] Array of channel histograms, each having BINS counters of integral type \p HistoCounter.
-        int                 num_samples,                        ///< [in] Number of samples to process
-        cudaStream_t        stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_histogram_sweep_config;
-            InitConfigs(ptx_version, device_histogram_sweep_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_samples,
-                d_histograms,
-                num_samples,
-                stream,
-                debug_synchronous,
-                DeviceHistogramInitKernel<BINS, ACTIVE_CHANNELS, Offset, HistoCounter>,
-                DeviceHistogramSweepKernel<PtxRangeHistoPolicy, BINS, CHANNELS, ACTIVE_CHANNELS, InputIterator, HistoCounter, Offset>,
-                DeviceHistogramAggregateKernel<BINS, ACTIVE_CHANNELS, HistoCounter>,
-                device_histogram_sweep_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh
deleted file mode 100644
index b800e4dc1..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_radix_sort_dispatch.cuh
+++ /dev/null
@@ -1,944 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../block_sweep/block_radix_sort_upsweep.cuh"
-#include "../../block_sweep/block_radix_sort_downsweep.cuh"
-#include "../../block_sweep/block_scan_sweep.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Upsweep pass kernel entry point (multi-block).  Computes privatized digit histograms, one per block.
- */
-template <
-    typename                BlockRadixSortUpsweepPolicy,        ///< Parameterized BlockRadixSortUpsweepPolicy tuning policy type
-    bool                    DESCENDING,                         ///< Whether or not the sorted-order is high-to-low
-    typename                Key,                                ///< Key type
-    typename                Offset>                             ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockRadixSortUpsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortUpsweepKernel(
-    Key                     *d_keys,                            ///< [in] Input keys buffer
-    Offset                  *d_spine,                           ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    Offset                  num_items,                          ///< [in] Total number of input data items
-    int                     current_bit,                        ///< [in] Bit position of current radix digit
-    int                     num_bits,                           ///< [in] Number of bits of current radix digit
-    bool                    first_pass,                         ///< [in] Whether this is the first digit pass
-    GridEvenShare<Offset>   even_share)                         ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-{
-    // Parameterize BlockRadixSortUpsweep type for the current configuration
-    typedef BlockRadixSortUpsweep<BlockRadixSortUpsweepPolicy, Key, Offset> BlockRadixSortUpsweepT;          // Primary
-
-    // Shared memory storage
-    __shared__ typename BlockRadixSortUpsweepT::TempStorage temp_storage;
-
-    // Initialize even-share descriptor for this thread block
-    even_share.BlockInit();
-
-    Offset bin_count;
-    BlockRadixSortUpsweepT(temp_storage, d_keys, current_bit, num_bits).ProcessRegion(
-        even_share.block_offset,
-        even_share.block_end,
-        bin_count);
-
-    // Write out digit counts (striped)
-    if (threadIdx.x < BlockRadixSortUpsweepT::RADIX_DIGITS)
-    {
-        int bin_idx = (DESCENDING) ?
-            BlockRadixSortUpsweepT::RADIX_DIGITS - threadIdx.x - 1 :
-            threadIdx.x;
-
-        d_spine[(gridDim.x * bin_idx) + blockIdx.x] = bin_count;
-    }
-}
-
-
-/**
- * Spine scan kernel entry point (single-block).  Computes an exclusive prefix sum over the privatized digit histograms
- */
-template <
-    typename    BlockScanSweepPolicy,       ///< Parameterizable tuning policy type for cub::BlockScanSweep abstraction
-    typename    Offset>                     ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockScanSweepPolicy::BLOCK_THREADS), 1)
-__global__ void RadixSortScanBinsKernel(
-    Offset      *d_spine,                   ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    int         num_counts)                 ///< [in] Total number of bin-counts
-{
-    // Parameterize the BlockScanSweep type for the current configuration
-    typedef BlockScanSweep<BlockScanSweepPolicy, Offset*, Offset*, cub::Sum, Offset, Offset> BlockScanSweepT;
-
-    // Shared memory storage
-    __shared__ typename BlockScanSweepT::TempStorage temp_storage;
-
-    if (blockIdx.x > 0) return;
-
-    // Block scan instance
-    BlockScanSweepT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), Offset(0)) ;
-
-    // Process full input tiles
-    int block_offset = 0;
-    BlockScanRunningPrefixOp<Offset, Sum> prefix_op(0, Sum());
-    while (block_offset + BlockScanSweepT::TILE_ITEMS <= num_counts)
-    {
-        block_scan.ConsumeTile<true, false>(block_offset, prefix_op);
-        block_offset += BlockScanSweepT::TILE_ITEMS;
-    }
-}
-
-
-/**
- * Downsweep pass kernel entry point (multi-block).  Scatters keys (and values) into corresponding bins for the current digit place.
- */
-template <
-    typename                BlockRadixSortDownsweepPolicy,          ///< Parameterizable tuning policy type for cub::BlockRadixSortUpsweep abstraction
-    bool                    DESCENDING,                             ///< Whether or not the sorted-order is high-to-low
-    typename                Key,                                    ///< Key type
-    typename                Value,                                  ///< Value type
-    typename                Offset>                                 ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockRadixSortDownsweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRadixSortDownsweepKernel(
-    Key                     *d_keys_in,                             ///< [in] Input keys ping buffer
-    Key                     *d_keys_out,                            ///< [in] Output keys pong buffer
-    Value                   *d_values_in,                           ///< [in] Input values ping buffer
-    Value                   *d_values_out,                          ///< [in] Output values pong buffer
-    Offset                  *d_spine,                               ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.)
-    Offset                  num_items,                              ///< [in] Total number of input data items
-    int                     current_bit,                            ///< [in] Bit position of current radix digit
-    int                     num_bits,                               ///< [in] Number of bits of current radix digit
-    bool                    first_pass,                             ///< [in] Whether this is the first digit pass
-    bool                    last_pass,                              ///< [in] Whether this is the last digit pass
-    GridEvenShare<Offset>   even_share)                             ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-{
-    // Parameterize BlockRadixSortDownsweep type for the current configuration
-    typedef BlockRadixSortDownsweep<BlockRadixSortDownsweepPolicy, DESCENDING, Key, Value, Offset> BlockRadixSortDownsweepT;
-
-    // Shared memory storage
-    __shared__  typename BlockRadixSortDownsweepT::TempStorage temp_storage;
-
-    // Initialize even-share descriptor for this thread block
-    even_share.BlockInit();
-
-    // Process input tiles
-    BlockRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion(
-        even_share.block_offset,
-        even_share.block_end);
-}
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceRadixSort
- */
-template <
-    bool     DESCENDING,        ///< Whether or not the sorted-order is high-to-low
-    typename Key,            ///< Key type
-    typename Value,          ///< Value type
-    typename Offset>         ///< Signed integer type for global offsets
-struct DeviceRadixSortDispatch
-{
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
-            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
-            RADIX_BITS      = 5,
-        };
-
-        // Primary UpsweepPolicy
-        typedef BlockRadixSortUpsweepPolicy <64,     CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128,    CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS> UpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortUpsweepPolicy <64,     CUB_MAX(1, 22 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128,    CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_LDG, RADIX_BITS - 1> AltUpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockScanSweepPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // Primary DownsweepPolicy
-        typedef BlockRadixSortDownsweepPolicy <64,   CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128,  CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortDownsweepPolicy <128,  CUB_MAX(1, 11 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128,  CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_DIRECT, LOAD_LDG, false, true, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-    };
-
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
-            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
-            RADIX_BITS      = 5,
-        };
-
-        // UpsweepPolicy
-        typedef BlockRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockScanSweepPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // DownsweepPolicy
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS> DownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeEightByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-    };
-
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
-            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
-            RADIX_BITS      = 5,
-        };
-
-        // UpsweepPolicy
-        typedef BlockRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockScanSweepPolicy <512, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // DownsweepPolicy
-        typedef BlockRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-    };
-
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            KEYS_ONLY       = (Equals<Value, NullType>::VALUE),
-            SCALE_FACTOR    = (CUB_MAX(sizeof(Key), sizeof(Value)) + 3) / 4,
-            RADIX_BITS      = 5,
-        };
-
-        // UpsweepPolicy
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 19 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS> UpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, UpsweepPolicyKeys, UpsweepPolicyPairs>::Type UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyKeys;
-        typedef BlockRadixSortUpsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltUpsweepPolicyKeys, AltUpsweepPolicyPairs>::Type AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockScanSweepPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_WARP_SCANS> ScanPolicy;
-
-        // DownsweepPolicy
-        typedef BlockRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <64, CUB_MAX(1, 19 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, DownsweepPolicyKeys, DownsweepPolicyPairs>::Type DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyKeys;
-        typedef BlockRadixSortDownsweepPolicy <128, CUB_MAX(1, 15 / SCALE_FACTOR), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicyPairs;
-        typedef typename If<KEYS_ONLY, AltDownsweepPolicyKeys, AltDownsweepPolicyPairs>::Type AltDownsweepPolicy;
-    };
-
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            RADIX_BITS = 4,
-        };
-
-        // UpsweepPolicy
-        typedef BlockRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS> UpsweepPolicy;
-
-        // Alternate UpsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortUpsweepPolicy <64, 9, LOAD_DEFAULT, RADIX_BITS - 1> AltUpsweepPolicy;
-
-        // ScanPolicy
-        typedef BlockScanSweepPolicy <256, 4, BLOCK_LOAD_VECTORIZE, false, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, false, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy;
-
-        // DownsweepPolicy
-        typedef BlockRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS> DownsweepPolicy;
-
-        // Alternate DownsweepPolicy for (RADIX_BITS-1)-bit passes
-        typedef BlockRadixSortDownsweepPolicy <64, 9, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, false, false, BLOCK_SCAN_WARP_SCANS, RADIX_SORT_SCATTER_TWO_PHASE, cudaSharedMemBankSizeFourByte, RADIX_BITS - 1> AltDownsweepPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxUpsweepPolicy         : PtxPolicy::UpsweepPolicy {};
-    struct PtxAltUpsweepPolicy      : PtxPolicy::AltUpsweepPolicy {};
-    struct PtxScanPolicy            : PtxPolicy::ScanPolicy {};
-    struct PtxDownsweepPolicy       : PtxPolicy::DownsweepPolicy {};
-    struct PtxAltDownsweepPolicy    : PtxPolicy::AltDownsweepPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <
-        typename Policy,
-        typename KernelConfig,
-        typename UpsweepKernelPtr,          ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename ScanKernelPtr,            ///< Function type of cub::SpineScanKernel
-        typename DownsweepKernelPtr>        ///< Function type of cub::DeviceRadixSortUpsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int                     sm_version,
-        int                     sm_count,
-        KernelConfig            &upsweep_config,
-        KernelConfig            &alt_upsweep_config,
-        KernelConfig            &scan_config,
-        KernelConfig            &downsweep_config,
-        KernelConfig            &alt_downsweep_config,
-        UpsweepKernelPtr        upsweep_kernel,
-        UpsweepKernelPtr        alt_upsweep_kernel,
-        ScanKernelPtr           scan_kernel,
-        DownsweepKernelPtr      downsweep_kernel,
-        DownsweepKernelPtr      alt_downsweep_kernel)
-    {
-        cudaError_t error;
-        do {
-            if (CubDebug(error = upsweep_config.template         InitUpsweepPolicy<typename Policy::UpsweepPolicy>(         sm_version, sm_count, upsweep_kernel))) break;
-            if (CubDebug(error = alt_upsweep_config.template     InitUpsweepPolicy<typename Policy::AltUpsweepPolicy>(      sm_version, sm_count, alt_upsweep_kernel))) break;
-            if (CubDebug(error = scan_config.template            InitScanPolicy<typename Policy::ScanPolicy>(               sm_version, sm_count, scan_kernel))) break;
-            if (CubDebug(error = downsweep_config.template       InitDownsweepPolicy<typename Policy::DownsweepPolicy>(     sm_version, sm_count, downsweep_kernel))) break;
-            if (CubDebug(error = alt_downsweep_config.template   InitDownsweepPolicy<typename Policy::AltDownsweepPolicy>(  sm_version, sm_count, alt_downsweep_kernel))) break;
-
-        } while (0);
-
-        return error;
-    }
-
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <
-        typename KernelConfig,
-        typename UpsweepKernelPtr,          ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename ScanKernelPtr,            ///< Function type of cub::SpineScanKernel
-        typename DownsweepKernelPtr>        ///< Function type of cub::DeviceRadixSortUpsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t InitConfigs(
-        int                     ptx_version,
-        int                     sm_version,
-        int                     sm_count,
-        KernelConfig            &upsweep_config,
-        KernelConfig            &alt_upsweep_config,
-        KernelConfig            &scan_config,
-        KernelConfig            &downsweep_config,
-        KernelConfig            &alt_downsweep_config,
-        UpsweepKernelPtr        upsweep_kernel,
-        UpsweepKernelPtr        alt_upsweep_kernel,
-        ScanKernelPtr          scan_kernel,
-        DownsweepKernelPtr      downsweep_kernel,
-        DownsweepKernelPtr      alt_downsweep_kernel)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        cudaError_t error;
-        do {
-
-            if (CubDebug(error = upsweep_config.template InitUpsweepPolicy<PtxUpsweepPolicy>(               sm_version, sm_count, upsweep_kernel))) break;
-            if (CubDebug(error = alt_upsweep_config.template InitUpsweepPolicy<PtxAltUpsweepPolicy>(        sm_version, sm_count, alt_upsweep_kernel))) break;
-            if (CubDebug(error = scan_config.template InitScanPolicy<PtxScanPolicy>(                        sm_version, sm_count, scan_kernel))) break;
-            if (CubDebug(error = downsweep_config.template InitDownsweepPolicy<PtxDownsweepPolicy>(         sm_version, sm_count, downsweep_kernel))) break;
-            if (CubDebug(error = alt_downsweep_config.template InitDownsweepPolicy<PtxAltDownsweepPolicy>(  sm_version, sm_count, alt_downsweep_kernel))) break;
-
-        } while (0);
-
-        return error;
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        cudaError_t error;
-        if (ptx_version >= 350)
-        {
-            error = InitConfigs<Policy350>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 300)
-        {
-            error = InitConfigs<Policy300>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 200)
-        {
-            error = InitConfigs<Policy200>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else if (ptx_version >= 130)
-        {
-            error = InitConfigs<Policy130>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-        else
-        {
-            error = InitConfigs<Policy100>(sm_version, sm_count, upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config, upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel);
-        }
-
-        return error;
-
-    #endif
-    }
-
-
-
-    /**
-     * Kernel kernel dispatch configurations
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        int                     tile_size;
-        cudaSharedMemConfig     smem_config;
-        int                     radix_bits;
-        int                     sm_occupancy;
-        int                     max_grid_size;
-        int                     subscription_factor;
-
-        CUB_RUNTIME_FUNCTION __forceinline__ KernelConfig()
-          : block_threads(0), items_per_thread(0), tile_size(0), smem_config(cudaSharedMemBankSizeDefault), radix_bits(0), sm_occupancy(0), max_grid_size(0), subscription_factor(0)
-        {
-        }
-
-        template <typename UpsweepPolicy, typename UpsweepKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitUpsweepPolicy(
-            int sm_version, int sm_count, UpsweepKernelPtr upsweep_kernel)
-        {
-            block_threads               = UpsweepPolicy::BLOCK_THREADS;
-            items_per_thread            = UpsweepPolicy::ITEMS_PER_THREAD;
-            radix_bits                  = UpsweepPolicy::RADIX_BITS;
-            smem_config                 = cudaSharedMemBankSizeFourByte;
-            tile_size                   = block_threads * items_per_thread;
-            cudaError_t retval          = MaxSmOccupancy(sm_occupancy, sm_version, upsweep_kernel, block_threads);
-            subscription_factor         = CUB_SUBSCRIPTION_FACTOR(sm_version);
-            max_grid_size               = (sm_occupancy * sm_count) * subscription_factor;
-
-            return retval;
-        }
-
-        template <typename ScanPolicy, typename ScanKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitScanPolicy(
-            int sm_version, int sm_count, ScanKernelPtr scan_kernel)
-        {
-            block_threads               = ScanPolicy::BLOCK_THREADS;
-            items_per_thread            = ScanPolicy::ITEMS_PER_THREAD;
-            radix_bits                  = 0;
-            smem_config                 = cudaSharedMemBankSizeFourByte;
-            tile_size                   = block_threads * items_per_thread;
-            sm_occupancy                = 1;
-            subscription_factor         = 1;
-            max_grid_size               = 1;
-
-            return cudaSuccess;
-        }
-
-        template <typename DownsweepPolicy, typename DownsweepKernelPtr>
-        CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t InitDownsweepPolicy(
-            int sm_version, int sm_count, DownsweepKernelPtr downsweep_kernel)
-        {
-            block_threads               = DownsweepPolicy::BLOCK_THREADS;
-            items_per_thread            = DownsweepPolicy::ITEMS_PER_THREAD;
-            radix_bits                  = DownsweepPolicy::RADIX_BITS;
-            smem_config                 = DownsweepPolicy::SMEM_CONFIG;
-            tile_size                   = block_threads * items_per_thread;
-            cudaError_t retval          = MaxSmOccupancy(sm_occupancy, sm_version, downsweep_kernel, block_threads);
-            subscription_factor         = CUB_SUBSCRIPTION_FACTOR(sm_version);
-            max_grid_size               = (sm_occupancy * sm_count) * subscription_factor;
-
-            return retval;
-        }
-    };
-
-
-    /******************************************************************************
-     * Allocation of device temporaries
-     ******************************************************************************/
-
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t AllocateTemporaries(
-        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        Offset*                 &d_spine,                       ///< [out] Digit count histograms per thread block
-        KernelConfig            &scan_config,                   ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
-        KernelConfig            &downsweep_config)              ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get spine size (conservative)
-            int spine_size = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
-
-            // Temporary storage allocation requirements
-            void* allocations[1];
-            size_t allocation_sizes[1] =
-            {
-                spine_size * sizeof(Offset),    // bytes needed for privatized block digit histograms
-            };
-
-            // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-                return cudaSuccess;
-
-            // Alias the allocation for the privatized per-block digit histograms
-            d_spine = (Offset*) allocations[0];
-
-        } while(0);
-
-        return error;
-    }
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide radix sort using the
-     * specified kernel functions.
-     */
-    template <
-        typename                UpsweepKernelPtr,               ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename                ScanKernelPtr,                  ///< Function type of cub::SpineScanKernel
-        typename                DownsweepKernelPtr>             ///< Function type of cub::DeviceRadixSortUpsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        DoubleBuffer<Key>       &d_keys,                        ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<Value>     &d_values,                      ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        Offset                  *d_spine,                       ///< [in] Digit count histograms per thread block
-        int                     spine_size,                     ///< [in] Number of histogram counters
-        Offset                  num_items,                      ///< [in] Number of items to reduce
-        int                     begin_bit,                      ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t            stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        KernelConfig            &upsweep_config,                ///< [in] Dispatch parameters that match the policy that \p upsweep_kernel was compiled for
-        KernelConfig            &scan_config,                   ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for
-        KernelConfig            &downsweep_config,              ///< [in] Dispatch parameters that match the policy that \p downsweep_kernel was compiled for
-        UpsweepKernelPtr        upsweep_kernel,                 ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        ScanKernelPtr           scan_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
-        DownsweepKernelPtr      downsweep_kernel)               ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get even-share work distribution descriptor
-            GridEvenShare<Offset> even_share(num_items, downsweep_config.max_grid_size, CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size));
-
-#if (CUB_PTX_ARCH == 0)
-            // Get current smem bank configuration
-            cudaSharedMemConfig original_smem_config;
-            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
-            cudaSharedMemConfig current_smem_config = original_smem_config;
-#endif
-            // Iterate over digit places
-            int current_bit = begin_bit;
-            while (current_bit < end_bit)
-            {
-                int num_bits = CUB_MIN(end_bit - current_bit, downsweep_config.radix_bits);
-
-#if (CUB_PTX_ARCH == 0)
-                // Update smem config if necessary
-                if (current_smem_config != upsweep_config.smem_config)
-                {
-                    if (CubDebug(error = cudaDeviceSetSharedMemConfig(upsweep_config.smem_config))) break;
-                    current_smem_config = upsweep_config.smem_config;
-                }
-#endif
-
-                // Log upsweep_kernel configuration
-                if (debug_synchronous)
-                    CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy, selector %d, current bit %d, bit_grain %d\n",
-                    even_share.grid_size, upsweep_config.block_threads, (long long) stream, upsweep_config.smem_config, upsweep_config.items_per_thread, upsweep_config.sm_occupancy, d_keys.selector, current_bit, downsweep_config.radix_bits);
-
-                // Invoke upsweep_kernel with same grid size as downsweep_kernel
-                upsweep_kernel<<<even_share.grid_size, upsweep_config.block_threads, 0, stream>>>(
-                    d_keys.d_buffers[d_keys.selector],
-                    d_spine,
-                    num_items,
-                    current_bit,
-                    num_bits,
-                    (current_bit == begin_bit),
-                    even_share);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-                // Log scan_kernel configuration
-                if (debug_synchronous) CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                    1, scan_config.block_threads, (long long) stream, scan_config.items_per_thread);
-
-                // Invoke scan_kernel
-                scan_kernel<<<1, scan_config.block_threads, 0, stream>>>(
-                    d_spine,
-                    spine_size);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-
-#if (CUB_PTX_ARCH == 0)
-                // Update smem config if necessary
-                if (current_smem_config != downsweep_config.smem_config)
-                {
-                    if (CubDebug(error = cudaDeviceSetSharedMemConfig(downsweep_config.smem_config))) break;
-                    current_smem_config = downsweep_config.smem_config;
-                }
-#endif
-                // Log downsweep_kernel configuration
-                if (debug_synchronous) CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d smem config, %d items per thread, %d SM occupancy\n",
-                    even_share.grid_size, downsweep_config.block_threads, (long long) stream, downsweep_config.smem_config, downsweep_config.items_per_thread, downsweep_config.sm_occupancy);
-
-                // Invoke downsweep_kernel
-                downsweep_kernel<<<even_share.grid_size, downsweep_config.block_threads, 0, stream>>>(
-                    d_keys.d_buffers[d_keys.selector],
-                    d_keys.d_buffers[d_keys.selector ^ 1],
-                    d_values.d_buffers[d_values.selector],
-                    d_values.d_buffers[d_values.selector ^ 1],
-                    d_spine,
-                    num_items,
-                    current_bit,
-                    num_bits,
-                    (current_bit == begin_bit),
-                    (current_bit + downsweep_config.radix_bits >= end_bit),
-                    even_share);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-                // Invert selectors
-                d_keys.selector ^= 1;
-                d_values.selector ^= 1;
-
-                // Update current bit position
-                current_bit += downsweep_config.radix_bits;
-            }
-
-#if (CUB_PTX_ARCH == 0)
-            // Reset smem config if necessary
-            if (current_smem_config != original_smem_config)
-            {
-                if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
-            }
-#endif
-
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    template <
-        typename UpsweepKernelPtr,          ///< Function type of cub::DeviceRadixSortUpsweepKernel
-        typename ScanKernelPtr,             ///< Function type of cub::SpineScanKernel
-        typename DownsweepKernelPtr>        ///< Function type of cub::DeviceRadixSortUpsweepKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>       &d_keys,                        ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<Value>     &d_values,                      ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        Offset                  num_items,                      ///< [in] Number of items to reduce
-        int                     begin_bit,                      ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t            stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        UpsweepKernelPtr        upsweep_kernel,                 ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        UpsweepKernelPtr        alt_upsweep_kernel,             ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        ScanKernelPtr           scan_kernel,                    ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel
-        DownsweepKernelPtr      downsweep_kernel,               ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-        DownsweepKernelPtr      alt_downsweep_kernel)           ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-
-        cudaError error = cudaSuccess;
-
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig upsweep_config;
-            KernelConfig alt_upsweep_config;
-            KernelConfig scan_config;
-            KernelConfig downsweep_config;
-            KernelConfig alt_downsweep_config;
-
-            if (CubDebug(error = InitConfigs(ptx_version, sm_version, sm_count,
-                upsweep_config, alt_upsweep_config, scan_config, downsweep_config, alt_downsweep_config,
-                upsweep_kernel, alt_upsweep_kernel, scan_kernel, downsweep_kernel, alt_downsweep_kernel))) break;
-
-            // Get spine sizes (conservative)
-            int spine_size      = (downsweep_config.max_grid_size * (1 << downsweep_config.radix_bits)) + scan_config.tile_size;
-            int alt_spine_size  = (alt_downsweep_config.max_grid_size * (1 << alt_downsweep_config.radix_bits)) + scan_config.tile_size;
-
-            // Allocate temporaries
-            Offset *d_spine = 0;
-            if (spine_size > alt_spine_size)
-            {
-                if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, downsweep_config))) break;
-            }
-            else
-            {
-                if (CubDebug(error = AllocateTemporaries(d_temp_storage, temp_storage_bytes, d_spine, scan_config, alt_downsweep_config))) break;
-            }
-
-            // Return if the caller is simply requesting the size of the storage allocation
-            if (d_temp_storage == NULL)
-                return cudaSuccess;
-
-            // Run radix sorting passes
-            int num_bits = end_bit - begin_bit;
-            int remaining_bits = num_bits % downsweep_config.radix_bits;
-
-            if (remaining_bits != 0)
-            {
-                // Run passes of alternate configuration
-                int max_alt_passes  = downsweep_config.radix_bits - remaining_bits;
-                int alt_end_bit     = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_downsweep_config.radix_bits));
-
-                if (CubDebug(error = Dispatch(
-                    d_keys,
-                    d_values,
-                    d_spine,
-                    alt_spine_size,
-                    num_items,
-                    begin_bit,
-                    alt_end_bit,
-                    stream,
-                    debug_synchronous,
-                    alt_upsweep_config,
-                    scan_config,
-                    alt_downsweep_config,
-                    alt_upsweep_kernel,
-                    scan_kernel,
-                    alt_downsweep_kernel))) break;
-
-                begin_bit = alt_end_bit;
-            }
-
-            // Run passes of primary configuration
-            if (CubDebug(error = Dispatch(
-                d_keys,
-                d_values,
-                d_spine,
-                spine_size,
-                num_items,
-                begin_bit,
-                end_bit,
-                stream,
-                debug_synchronous,
-                upsweep_config,
-                scan_config,
-                downsweep_config,
-                upsweep_kernel,
-                scan_kernel,
-                downsweep_kernel))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                    *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                  &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        DoubleBuffer<Key>       &d_keys,                        ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys
-        DoubleBuffer<Value>     &d_values,                      ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values
-        Offset                  num_items,                      ///< [in] Number of items to reduce
-        int                     begin_bit,                      ///< [in] The beginning (least-significant) bit index needed for key comparison
-        int                     end_bit,                        ///< [in] The past-the-end (most-significant) bit index needed for key comparison
-        cudaStream_t            stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                    debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        return Dispatch(
-            d_temp_storage,
-            temp_storage_bytes,
-            d_keys,
-            d_values,
-            num_items,
-            begin_bit,
-            end_bit,
-            stream,
-            debug_synchronous,
-            DeviceRadixSortUpsweepKernel<PtxUpsweepPolicy, DESCENDING, Key, Offset>,
-            DeviceRadixSortUpsweepKernel<PtxAltUpsweepPolicy, DESCENDING, Key, Offset>,
-            RadixSortScanBinsKernel<PtxScanPolicy, Offset>,
-            DeviceRadixSortDownsweepKernel<PtxDownsweepPolicy, DESCENDING, Key, Value, Offset>,
-            DeviceRadixSortDownsweepKernel<PtxAltDownsweepPolicy, DESCENDING, Key, Value, Offset>);
-    }
-
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
deleted file mode 100644
index 7ad75290c..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_by_key_dispatch.cuh
+++ /dev/null
@@ -1,592 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "device_scan_dispatch.cuh"
-#include "../../block_sweep/block_reduce_by_key_sweep.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Multi-block reduce-by-key sweep kernel entry point
- */
-template <
-    typename            BlockReduceSweepByKeyPolicy,            ///< Parameterized BlockReduceSweepByKeyPolicy tuning policy type
-    typename            KeysInputIterator,                      ///< Random-access input iterator type for keys
-    typename            UniqueOutputIterator,                   ///< Random-access output iterator type for keys
-    typename            ValuesInputIterator,                    ///< Random-access input iterator type for values
-    typename            AggregatesOutputIterator,               ///< Random-access output iterator type for values
-    typename            NumRunsOutputIterator,                  ///< Output iterator type for recording number of segments encountered
-    typename            ScanTileState,                          ///< Tile status interface type
-    typename            EqualityOp,                             ///< Key equality operator type
-    typename            ReductionOp,                            ///< Value reduction operator type
-    typename            Offset>                                 ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockReduceSweepByKeyPolicy::BLOCK_THREADS))
-__global__ void DeviceReduceByKeySweepKernel(
-    KeysInputIterator           d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-    UniqueOutputIterator        d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-    ValuesInputIterator         d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-    AggregatesOutputIterator    d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-    NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-    ScanTileState               tile_status,                    ///< [in] Tile status interface
-    EqualityOp                  equality_op,                    ///< [in] Key equality operator
-    ReductionOp                 reduction_op,                   ///< [in] Value reduction operator
-    Offset                      num_items,                      ///< [in] Total number of items to select from
-    int                         num_tiles,                      ///< [in] Total number of tiles for the entire problem
-    GridQueue<int>              queue)                          ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for reducing tiles of value segments
-    typedef BlockReduceSweepByKey<
-        BlockReduceSweepByKeyPolicy,
-        KeysInputIterator,
-        UniqueOutputIterator,
-        ValuesInputIterator,
-        AggregatesOutputIterator,
-        EqualityOp,
-        ReductionOp,
-        Offset> BlockReduceSweepByKeyT;
-
-    // Shared memory for BlockReduceSweepByKey
-    __shared__ typename BlockReduceSweepByKeyT::TempStorage temp_storage;
-
-    // Process tiles
-    BlockReduceSweepByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, equality_op, reduction_op, num_items).ConsumeRange(
-        num_tiles,
-        queue,
-        tile_status,
-        d_num_runs_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey
- */
-template <
-    typename    KeysInputIterator,               ///< Random-access input iterator type for keys
-    typename    UniqueOutputIterator,              ///< Random-access output iterator type for keys
-    typename    ValuesInputIterator,             ///< Random-access input iterator type for values
-    typename    AggregatesOutputIterator,            ///< Random-access output iterator type for values
-    typename    NumRunsOutputIterator,            ///< Output iterator type for recording number of segments encountered
-    typename    EqualityOp,                     ///< Key equality operator type
-    typename    ReductionOp,                    ///< Value reduction operator type
-    typename    Offset>                         ///< Signed integer type for global offsets
-struct DeviceReduceByKeyDispatch
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // Data type of key input iterator
-    typedef typename std::iterator_traits<KeysInputIterator>::value_type Key;
-
-    // Data type of value input iterator
-    typedef typename std::iterator_traits<ValuesInputIterator>::value_type Value;
-
-    enum
-    {
-        INIT_KERNEL_THREADS     = 128,
-        MAX_INPUT_BYTES         = CUB_MAX(sizeof(Key), sizeof(Value)),
-        COMBINED_INPUT_BYTES    = sizeof(Key) + sizeof(Value),
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<Value, Offset> ScanTileState;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 8,
-            ITEMS_PER_THREAD            = (MAX_INPUT_BYTES <= 8) ? 8 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef BlockReduceSweepByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 6,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef BlockReduceSweepByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 13,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef BlockReduceSweepByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 7,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef BlockReduceSweepByKeyPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            ReduceByKeyPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)),
-        };
-
-        typedef BlockReduceSweepByKeyPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING>
-            ReduceByKeyPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_reduce_by_key_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_reduce_by_key_sweep_config.template Init<PtxReduceByKeyPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_reduce_by_key_sweep_config.template Init<typename Policy350::ReduceByKeyPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_reduce_by_key_sweep_config.template Init<typename Policy300::ReduceByKeyPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_reduce_by_key_sweep_config.template Init<typename Policy200::ReduceByKeyPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_reduce_by_key_sweep_config.template Init<typename Policy130::ReduceByKeyPolicy>();
-        }
-        else
-        {
-            device_reduce_by_key_sweep_config.template Init<typename Policy100::ReduceByKeyPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within BlockReduceSweepByKeyPolicy.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        bool                    two_phase_scatter;
-        BlockScanAlgorithm      scan_algorithm;
-        cudaSharedMemConfig     smem_config;
-
-        template <typename BlockReduceSweepByKeyPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockReduceSweepByKeyPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockReduceSweepByKeyPolicy::ITEMS_PER_THREAD;
-            load_policy                 = BlockReduceSweepByKeyPolicy::LOAD_ALGORITHM;
-            two_phase_scatter           = BlockReduceSweepByKeyPolicy::TWO_PHASE_SCATTER;
-            scan_algorithm              = BlockReduceSweepByKeyPolicy::SCAN_ALGORITHM;
-            smem_config                 = cudaSharedMemBankSizeEightByte;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                two_phase_scatter,
-                scan_algorithm,
-                smem_config);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduce-by-key using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,                ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceReduceByKeySweepKernelPtr>        ///< Function type of cub::DeviceReduceByKeySweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                            *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                          &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIterator               d_keys_in,                          ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIterator            d_unique_out,                       ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIterator             d_values_in,                        ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIterator        d_aggregates_out,                   ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIterator           d_num_runs_out,                         ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOp                      equality_op,                        ///< [in] Key equality operator
-        ReductionOp                     reduction_op,                       ///< [in] Value reduction operator
-        Offset                          num_items,                          ///< [in] Total number of items to select from
-        cudaStream_t                    stream,                             ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                            debug_synchronous,                  ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                             ptx_version,                        ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr         device_scan_init_kernel,            ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceReduceByKeySweepKernelPtr range_reduce_by_key_kernel,         ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeySweepKernel
-        KernelConfig                    device_reduce_by_key_sweep_config)  ///< [in] Dispatch parameters that match the policy that \p range_reduce_by_key_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_reduce_by_key_sweep_config.block_threads * device_reduce_by_key_sweep_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[2];
-            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-            allocation_sizes[1] = GridQueue<int>::AllocationSize();                                             // bytes needed for grid queue descriptor
-
-            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
-            void* allocations[2];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Construct the tile status interface
-            ScanTileState tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Construct the grid queue descriptor
-            GridQueue<int> queue(allocations[1]);
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                queue,
-                tile_status,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for range_reduce_by_key_kernel
-            int range_reduce_by_key_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                range_reduce_by_key_sm_occupancy,            // out
-                sm_version,
-                range_reduce_by_key_kernel,
-                device_reduce_by_key_sweep_config.block_threads))) break;
-
-            // Get grid size for scanning tiles
-            dim3 reduce_by_key_grid_size;
-            if (ptx_version <= 130)
-            {
-                // Blocks are launched in order, so just assign one block per tile
-                int max_dim_x = 32 * 1024;
-                reduce_by_key_grid_size.z = 1;
-                reduce_by_key_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
-                reduce_by_key_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-            }
-            else
-            {
-                // Blocks may not be launched in order, so use atomics
-                int range_reduce_by_key_occupancy = range_reduce_by_key_sm_occupancy * sm_count;      // Whole-device occupancy for range_reduce_by_key_kernel
-                reduce_by_key_grid_size.z = 1;
-                reduce_by_key_grid_size.y = 1;
-                reduce_by_key_grid_size.x = (num_tiles < range_reduce_by_key_occupancy) ?
-                    num_tiles :                             // Not enough to fill the device with threadblocks
-                    range_reduce_by_key_occupancy;         // Fill the device with threadblocks
-            }
-
-#if (CUB_PTX_ARCH == 0)
-            // Get current smem bank configuration
-            cudaSharedMemConfig original_smem_config;
-            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
-            cudaSharedMemConfig current_smem_config = original_smem_config;
-
-            // Update smem config if necessary
-            if (current_smem_config != device_reduce_by_key_sweep_config.smem_config)
-            {
-                if (CubDebug(error = cudaDeviceSetSharedMemConfig(device_reduce_by_key_sweep_config.smem_config))) break;
-                current_smem_config = device_reduce_by_key_sweep_config.smem_config;
-            }
-#endif
-
-            // Log range_reduce_by_key_kernel configuration
-            if (debug_synchronous) CubLog("Invoking range_reduce_by_key_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                reduce_by_key_grid_size.x, reduce_by_key_grid_size.y, reduce_by_key_grid_size.z, device_reduce_by_key_sweep_config.block_threads, (long long) stream, device_reduce_by_key_sweep_config.items_per_thread, range_reduce_by_key_sm_occupancy);
-
-            // Invoke range_reduce_by_key_kernel
-            range_reduce_by_key_kernel<<<reduce_by_key_grid_size, device_reduce_by_key_sweep_config.block_threads, 0, stream>>>(
-                d_keys_in,
-                d_unique_out,
-                d_values_in,
-                d_aggregates_out,
-                d_num_runs_out,
-                tile_status,
-                equality_op,
-                reduction_op,
-                num_items,
-                num_tiles,
-                queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-#if (CUB_PTX_ARCH == 0)
-            // Reset smem config if necessary
-            if (current_smem_config != original_smem_config)
-            {
-                if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
-            }
-#endif
-
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        KeysInputIterator           d_keys_in,                      ///< [in] Pointer to the input sequence of keys
-        UniqueOutputIterator        d_unique_out,                   ///< [out] Pointer to the output sequence of unique keys (one key per run)
-        ValuesInputIterator         d_values_in,                    ///< [in] Pointer to the input sequence of corresponding values
-        AggregatesOutputIterator    d_aggregates_out,               ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run)
-        NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out)
-        EqualityOp                  equality_op,                    ///< [in] Key equality operator
-        ReductionOp                 reduction_op,                   ///< [in] Value reduction operator
-        Offset                      num_items,                      ///< [in] Total number of items to select from
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_reduce_by_key_sweep_config;
-            InitConfigs(ptx_version, device_reduce_by_key_sweep_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_keys_in,
-                d_unique_out,
-                d_values_in,
-                d_aggregates_out,
-                d_num_runs_out,
-                equality_op,
-                reduction_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<Offset, ScanTileState>,
-                DeviceReduceByKeySweepKernel<PtxReduceByKeyPolicy, KeysInputIterator, UniqueOutputIterator, ValuesInputIterator, AggregatesOutputIterator, NumRunsOutputIterator, ScanTileState, EqualityOp, ReductionOp, Offset>,
-                device_reduce_by_key_sweep_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh
deleted file mode 100644
index 403d63ae4..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_reduce_dispatch.cuh
+++ /dev/null
@@ -1,742 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../block_sweep/block_reduce_sweep.cuh"
-#include "../../iterator/constant_input_iterator.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_even_share.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../iterator/arg_index_input_iterator.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Reduce region kernel entry point (multi-block).  Computes privatized reductions, one per thread block.
- */
-template <
-    typename                BlockReduceSweepPolicy,     ///< Parameterized BlockReduceSweepPolicy tuning policy type
-    typename                InputIterator,              ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIterator,             ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                Offset,                     ///< Signed integer type for global offsets
-    typename                ReductionOp>                ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-__launch_bounds__ (int(BlockReduceSweepPolicy::BLOCK_THREADS))
-__global__ void DeviceReduceSweepKernel(
-    InputIterator           d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIterator          d_out,                      ///< [out] Pointer to the output aggregate
-    Offset                  num_items,                  ///< [in] Total number of input data items
-    GridEvenShare<Offset>   even_share,                 ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block
-    GridQueue<Offset>       queue,                      ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-    ReductionOp             reduction_op)               ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-{
-    // Data type
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Thread block type for reducing input tiles
-    typedef BlockReduceSweep<BlockReduceSweepPolicy, InputIterator, Offset, ReductionOp> BlockReduceSweepT;
-
-    // Block-wide aggregate
-    T block_aggregate;
-
-    // Shared memory storage
-    __shared__ typename BlockReduceSweepT::TempStorage temp_storage;
-
-    // Consume input tiles
-    BlockReduceSweepT(temp_storage, d_in, reduction_op).ConsumeRange(
-        num_items,
-        even_share,
-        queue,
-        block_aggregate,
-        Int2Type<BlockReduceSweepPolicy::GRID_MAPPING>());
-
-    // Output result
-    if (threadIdx.x == 0)
-    {
-        d_out[blockIdx.x] = block_aggregate;
-    }
-}
-
-
-/**
- * Reduce a single tile kernel entry point (single-block).  Can be used to aggregate privatized threadblock reductions from a previous multi-block reduction pass.
- */
-template <
-    typename                BlockReduceSweepPolicy,     ///< Parameterized BlockReduceSweepPolicy tuning policy type
-    typename                InputIterator,              ///< Random-access input iterator type for reading input items \iterator
-    typename                OutputIterator,             ///< Output iterator type for recording the reduced aggregate \iterator
-    typename                Offset,                     ///< Signed integer type for global offsets
-    typename                ReductionOp>                ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-__launch_bounds__ (int(BlockReduceSweepPolicy::BLOCK_THREADS), 1)
-__global__ void SingleReduceSweepKernel(
-    InputIterator           d_in,                       ///< [in] Pointer to the input sequence of data items
-    OutputIterator          d_out,                      ///< [out] Pointer to the output aggregate
-    Offset                  num_items,                  ///< [in] Total number of input data items
-    ReductionOp             reduction_op)               ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-{
-    // Data type
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Thread block type for reducing input tiles
-    typedef BlockReduceSweep<BlockReduceSweepPolicy, InputIterator, Offset, ReductionOp> BlockReduceSweepT;
-
-    // Block-wide aggregate
-    T block_aggregate;
-
-    // Shared memory storage
-    __shared__ typename BlockReduceSweepT::TempStorage temp_storage;
-
-    // Consume input tiles
-    BlockReduceSweepT(temp_storage, d_in, reduction_op).ConsumeRange(
-        Offset(0),
-        Offset(num_items),
-        block_aggregate);
-
-    // Output result
-    if (threadIdx.x == 0)
-    {
-        d_out[blockIdx.x] = block_aggregate;
-    }
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceReduce
- */
-template <
-    typename InputIterator,     ///< Random-access input iterator type for reading input items \iterator
-    typename OutputIterator,    ///< Output iterator type for recording the reduced aggregate \iterator
-    typename Offset,            ///< Signed integer type for global offsets
-    typename ReductionOp>       ///< Binary reduction functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-struct DeviceReduceDispatch
-{
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        // RangeReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
-        typedef BlockReduceSweepPolicy<
-                128,                                ///< Threads per thread block
-                24,                                 ///< Items per thread per tile of input
-                4,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG,                           ///< Cache load modifier
-                GRID_MAPPING_DYNAMIC>               ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy1B;
-
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 20,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        // RangeReducePolicy4B (GTX Titan: 255.1 GB/s @ 48M 4B items)
-        typedef BlockReduceSweepPolicy<
-                256,                                ///< Threads per thread block
-                ITEMS_PER_THREAD,                   ///< Items per thread per tile of input
-                2,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_LDG,                           ///< Cache load modifier
-                GRID_MAPPING_DYNAMIC>               ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy4B;
-
-        // RangeReducePolicy
-        typedef typename If<(sizeof(T) >= 4),
-            RangeReducePolicy4B,
-            RangeReducePolicy1B>::Type RangeReducePolicy;
-
-        // SingleTilePolicy
-        typedef BlockReduceSweepPolicy<
-                256,                                ///< Threads per thread block
-                8,                                  ///< Items per thread per tile of input
-                1,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,       ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            SingleTilePolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 2,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        // RangeReducePolicy (GTX670: 154.0 @ 48M 4B items)
-        typedef BlockReduceSweepPolicy<
-                256,                                ///< Threads per thread block
-                ITEMS_PER_THREAD,                   ///< Items per thread per tile of input
-                1,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,       ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy;
-
-        // SingleTilePolicy
-        typedef BlockReduceSweepPolicy<
-                256,                                ///< Threads per thread block
-                24,                                 ///< Items per thread per tile of input
-                4,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_WARP_REDUCTIONS,       ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            SingleTilePolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        // RangeReducePolicy1B (GTX 580: 158.1 GB/s @ 192M 1B items)
-        typedef BlockReduceSweepPolicy<
-                192,                                ///< Threads per thread block
-                24,                                 ///< Items per thread per tile of input
-                4,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                (sizeof(T) == 1) ?                  ///< How to map tiles of input onto thread blocks
-                    GRID_MAPPING_EVEN_SHARE :
-                    GRID_MAPPING_DYNAMIC>
-            RangeReducePolicy1B;
-
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 8,
-            NOMINAL_4B_VEC_ITEMS        = 4,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-            VEC_ITEMS                   = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
-        };
-
-        // RangeReducePolicy4B (GTX 580: 178.9 GB/s @ 48M 4B items)
-        typedef BlockReduceSweepPolicy<
-                128,                                ///< Threads per thread block
-                ITEMS_PER_THREAD,                   ///< Items per thread per tile of input
-                VEC_ITEMS,                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_DYNAMIC>               ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy4B;
-
-        // RangeReducePolicy
-        typedef typename If<(sizeof(T) < 4),
-            RangeReducePolicy1B,
-            RangeReducePolicy4B>::Type RangeReducePolicy;
-
-        // SingleTilePolicy
-        typedef BlockReduceSweepPolicy<
-                192,                                ///< Threads per thread block
-                7,                                  ///< Items per thread per tile of input
-                1,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            SingleTilePolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 8,
-            NOMINAL_4B_VEC_ITEMS        = 2,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-            VEC_ITEMS                   = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
-        };
-
-        // RangeReducePolicy
-        typedef BlockReduceSweepPolicy<
-                128,                                ///< Threads per thread block
-                ITEMS_PER_THREAD,                   ///< Items per thread per tile of input
-                VEC_ITEMS,                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy;
-
-        // SingleTilePolicy
-        typedef BlockReduceSweepPolicy<
-                32,                                 ///< Threads per thread block
-                4,                                  ///< Items per thread per tile of input
-                VEC_ITEMS,                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            SingleTilePolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 8,
-            NOMINAL_4B_VEC_ITEMS        = 2,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-            VEC_ITEMS                   = CUB_MIN(NOMINAL_4B_VEC_ITEMS, CUB_MAX(1, (NOMINAL_4B_VEC_ITEMS * 4 / sizeof(T)))),
-        };
-
-        // RangeReducePolicy
-        typedef BlockReduceSweepPolicy<
-                128,                                ///< Threads per thread block
-                ITEMS_PER_THREAD,                   ///< Items per thread per tile of input
-                VEC_ITEMS,                          ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            RangeReducePolicy;
-
-        // SingleTilePolicy
-        typedef BlockReduceSweepPolicy<
-                32,                                 ///< Threads per thread block
-                4,                                  ///< Items per thread per tile of input
-                4,                                  ///< Number of items per vectorized load
-                BLOCK_REDUCE_RAKING,                ///< Cooperative block-wide reduction algorithm to use
-                LOAD_DEFAULT,                       ///< Cache load modifier
-                GRID_MAPPING_EVEN_SHARE>            ///< How to map tiles of input onto thread blocks
-            SingleTilePolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRangeReducePolicy   : PtxPolicy::RangeReducePolicy {};
-    struct PtxSingleTilePolicy     : PtxPolicy::SingleTilePolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_reduce_sweep_config,
-        KernelConfig    &single_reduce_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_reduce_sweep_config.template Init<PtxRangeReducePolicy>();
-        single_reduce_sweep_config.template Init<PtxSingleTilePolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_reduce_sweep_config.template     Init<typename Policy350::RangeReducePolicy>();
-            single_reduce_sweep_config.template     Init<typename Policy350::SingleTilePolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_reduce_sweep_config.template     Init<typename Policy300::RangeReducePolicy>();
-            single_reduce_sweep_config.template     Init<typename Policy300::SingleTilePolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_reduce_sweep_config.template     Init<typename Policy200::RangeReducePolicy>();
-            single_reduce_sweep_config.template     Init<typename Policy200::SingleTilePolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_reduce_sweep_config.template     Init<typename Policy130::RangeReducePolicy>();
-            single_reduce_sweep_config.template     Init<typename Policy130::SingleTilePolicy>();
-        }
-        else
-        {
-            device_reduce_sweep_config.template     Init<typename Policy100::RangeReducePolicy>();
-            single_reduce_sweep_config.template     Init<typename Policy100::SingleTilePolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        int                     vector_load_length;
-        BlockReduceAlgorithm    block_algorithm;
-        CacheLoadModifier       load_modifier;
-        GridMappingStrategy     grid_mapping;
-
-        template <typename BlockPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockPolicy::ITEMS_PER_THREAD;
-            vector_load_length          = BlockPolicy::VECTOR_LOAD_LENGTH;
-            block_algorithm             = BlockPolicy::BLOCK_ALGORITHM;
-            load_modifier               = BlockPolicy::LOAD_MODIFIER;
-            grid_mapping                = BlockPolicy::GRID_MAPPING;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d threads, %d per thread, %d veclen, %d algo, %d loadmod, %d mapping",
-                block_threads,
-                items_per_thread,
-                vector_load_length,
-                block_algorithm,
-                load_modifier,
-                grid_mapping);
-        }
-    };
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction using the
-     * specified kernel functions.
-     *
-     * If the input is larger than a single tile, this method uses two-passes of
-     * kernel invocations.
-     */
-    template <
-        typename                    DeviceReduceSweepKernelPtr,         ///< Function type of cub::DeviceReduceSweepKernel
-        typename                    SingleReducePartialsKernelPtr,      ///< Function type of cub::SingleReduceSweepKernel for consuming partial reductions (T*)
-        typename                    SingleReduceSweepKernelPtr,         ///< Function type of cub::SingleReduceSweepKernel for consuming input (InputIterator)
-        typename                    FillAndResetDrainKernelPtr>         ///< Function type of cub::FillAndResetDrainKernel
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                            *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                          &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator                   d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator                  d_out,                          ///< [out] Pointer to the output aggregate
-        Offset                          num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOp                     reduction_op,                   ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        cudaStream_t                    stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                            debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        FillAndResetDrainKernelPtr      prepare_drain_kernel,           ///< [in] Kernel function pointer to parameterization of cub::FillAndResetDrainKernel
-        DeviceReduceSweepKernelPtr      device_reduce_sweep_kernel,     ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSweepKernel
-        SingleReducePartialsKernelPtr   single_reduce_partials_kernel,  ///< [in] Kernel function pointer to parameterization of cub::SingleReduceSweepKernel for consuming partial reductions (T*)
-        SingleReduceSweepKernelPtr      single_reduce_sweep_kernel,     ///< [in] Kernel function pointer to parameterization of cub::SingleReduceSweepKernel for consuming input (InputIterator)
-        KernelConfig                    device_reduce_sweep_config,     ///< [in] Dispatch parameters that match the policy that \p range_reduce_kernel_ptr was compiled for
-        KernelConfig                    single_reduce_sweep_config)     ///< [in] Dispatch parameters that match the policy that \p single_reduce_sweep_kernel was compiled for
-    {
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported );
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Tile size of device_reduce_sweep_kernel
-            int tile_size = device_reduce_sweep_config.block_threads * device_reduce_sweep_config.items_per_thread;
-
-            if ((device_reduce_sweep_kernel == NULL) || (num_items <= tile_size))
-            {
-                // Dispatch a single-block reduction kernel
-
-                // Return if the caller is simply requesting the size of the storage allocation
-                if (d_temp_storage == NULL)
-                {
-                    temp_storage_bytes = 1;
-                    return cudaSuccess;
-                }
-
-                // Log single_reduce_sweep_kernel configuration
-                if (debug_synchronous) CubLog("Invoking ReduceSingle<<<1, %d, 0, %lld>>>(), %d items per thread\n",
-                    single_reduce_sweep_config.block_threads, (long long) stream, single_reduce_sweep_config.items_per_thread);
-
-                // Invoke single_reduce_sweep_kernel
-                single_reduce_sweep_kernel<<<1, single_reduce_sweep_config.block_threads>>>(
-                    d_in,
-                    d_out,
-                    num_items,
-                    reduction_op);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            }
-            else
-            {
-                // Dispatch two kernels: (1) a multi-block kernel to compute
-                // privatized per-block reductions, and (2) a single-block
-                // to reduce those partial reductions
-
-                // Get SM occupancy for device_reduce_sweep_kernel
-                int range_reduce_sm_occupancy;
-                if (CubDebug(error = MaxSmOccupancy(
-                    range_reduce_sm_occupancy,
-                    sm_version,
-                    device_reduce_sweep_kernel,
-                    device_reduce_sweep_config.block_threads))) break;
-
-                // Get device occupancy for device_reduce_sweep_kernel
-                int range_reduce_occupancy = range_reduce_sm_occupancy * sm_count;
-
-                // Even-share work distribution
-                int subscription_factor = range_reduce_sm_occupancy;     // Amount of CTAs to oversubscribe the device beyond actively-resident (heuristic)
-                GridEvenShare<Offset> even_share(
-                    num_items,
-                    range_reduce_occupancy * subscription_factor,
-                    tile_size);
-
-                // Get grid size for device_reduce_sweep_kernel
-                int range_reduce_grid_size;
-                switch (device_reduce_sweep_config.grid_mapping)
-                {
-                case GRID_MAPPING_EVEN_SHARE:
-
-                    // Work is distributed evenly
-                    range_reduce_grid_size = even_share.grid_size;
-                    break;
-
-                case GRID_MAPPING_DYNAMIC:
-
-                    // Work is distributed dynamically
-                    int num_tiles = (num_items + tile_size - 1) / tile_size;
-                    range_reduce_grid_size = (num_tiles < range_reduce_occupancy) ?
-                        num_tiles :                     // Not enough to fill the device with threadblocks
-                        range_reduce_occupancy;         // Fill the device with threadblocks
-                    break;
-                };
-
-                // Temporary storage allocation requirements
-                void* allocations[2];
-                size_t allocation_sizes[2] =
-                {
-                    range_reduce_grid_size * sizeof(T),     // bytes needed for privatized block reductions
-                    GridQueue<int>::AllocationSize()        // bytes needed for grid queue descriptor
-                };
-
-                // Alias the temporary allocations from the single storage blob (or set the necessary size of the blob)
-                if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-                if (d_temp_storage == NULL)
-                {
-                    // Return if the caller is simply requesting the size of the storage allocation
-                    return cudaSuccess;
-                }
-
-                // Alias the allocation for the privatized per-block reductions
-                T *d_block_reductions = (T*) allocations[0];
-
-                // Alias the allocation for the grid queue descriptor
-                GridQueue<Offset> queue(allocations[1]);
-
-                // Prepare the dynamic queue descriptor if necessary
-                if (device_reduce_sweep_config.grid_mapping == GRID_MAPPING_DYNAMIC)
-                {
-                    // Prepare queue using a kernel so we know it gets prepared once per operation
-                    if (debug_synchronous) CubLog("Invoking prepare_drain_kernel<<<1, 1, 0, %lld>>>()\n", (long long) stream);
-
-                    // Invoke prepare_drain_kernel
-                    prepare_drain_kernel<<<1, 1, 0, stream>>>(queue, num_items);
-
-                    // Check for failure to launch
-                    if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                    // Sync the stream if specified to flush runtime errors
-                    if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-                }
-
-                // Log device_reduce_sweep_kernel configuration
-                if (debug_synchronous) CubLog("Invoking device_reduce_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                    range_reduce_grid_size, device_reduce_sweep_config.block_threads, (long long) stream, device_reduce_sweep_config.items_per_thread, range_reduce_sm_occupancy);
-
-                // Invoke device_reduce_sweep_kernel
-                device_reduce_sweep_kernel<<<range_reduce_grid_size, device_reduce_sweep_config.block_threads, 0, stream>>>(
-                    d_in,
-                    d_block_reductions,
-                    num_items,
-                    even_share,
-                    queue,
-                    reduction_op);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-                // Log single_reduce_sweep_kernel configuration
-                if (debug_synchronous) CubLog("Invoking single_reduce_sweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n",
-                    1, single_reduce_sweep_config.block_threads, (long long) stream, single_reduce_sweep_config.items_per_thread);
-
-                // Invoke single_reduce_sweep_kernel
-                single_reduce_partials_kernel<<<1, single_reduce_sweep_config.block_threads, 0, stream>>>(
-                    d_block_reductions,
-                    d_out,
-                    range_reduce_grid_size,
-                    reduction_op);
-
-                // Check for failure to launch
-                if (CubDebug(error = cudaPeekAtLastError())) break;
-
-                // Sync the stream if specified to flush runtime errors
-                if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-            }
-        }
-        while (0);
-
-        return error;
-
-#endif // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine for computing a device-wide reduction
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,                ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                               ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                              ///< [out] Pointer to the output aggregate
-        Offset                      num_items,                          ///< [in] Total number of input items (i.e., length of \p d_in)
-        ReductionOp                 reduction_op,                       ///< [in] Binary reduction functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        cudaStream_t                stream,                             ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)                  ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_reduce_sweep_config;
-            KernelConfig single_reduce_sweep_config;
-            InitConfigs(ptx_version, device_reduce_sweep_config, single_reduce_sweep_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_out,
-                num_items,
-                reduction_op,
-                stream,
-                debug_synchronous,
-                FillAndResetDrainKernel<Offset>,
-                DeviceReduceSweepKernel<PtxRangeReducePolicy, InputIterator, T*, Offset, ReductionOp>,
-                SingleReduceSweepKernel<PtxSingleTilePolicy, T*, OutputIterator, Offset, ReductionOp>,
-                SingleReduceSweepKernel<PtxSingleTilePolicy, InputIterator, OutputIterator, Offset, ReductionOp>,
-                device_reduce_sweep_config,
-                single_reduce_sweep_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh
deleted file mode 100644
index 7b372e11f..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_rle_dispatch.cuh
+++ /dev/null
@@ -1,566 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "device_scan_dispatch.cuh"
-#include "../../block_sweep/block_rle_sweep.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            BlockRleSweepPolicy,        ///< Parameterized BlockRleSweepPolicy tuning policy type
-    typename            InputIterator,              ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIterator,      ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIterator,      ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIterator,      ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            ScanTileState,              ///< Tile status interface type
-    typename            EqualityOp,                 ///< T equality operator type
-    typename            Offset>                     ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockRleSweepPolicy::BLOCK_THREADS))
-__global__ void DeviceRleSweepKernel(
-    InputIterator               d_in,               ///< [in] Pointer to input sequence of data items
-    OffsetsOutputIterator       d_offsets_out,      ///< [out] Pointer to output sequence of run-offsets
-    LengthsOutputIterator       d_lengths_out,      ///< [out] Pointer to output sequence of run-lengths
-    NumRunsOutputIterator       d_num_runs_out,         ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-    ScanTileState               tile_status,        ///< [in] Tile status interface
-    EqualityOp                  equality_op,        ///< [in] Equality operator for input items
-    Offset                      num_items,          ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                         num_tiles,          ///< [in] Total number of tiles for the entire problem
-    GridQueue<int>              queue)              ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for selecting data from input tiles
-    typedef BlockRleSweep<
-        BlockRleSweepPolicy,
-        InputIterator,
-        OffsetsOutputIterator,
-        LengthsOutputIterator,
-        EqualityOp,
-        Offset> BlockRleSweepT;
-
-    // Shared memory for BlockRleSweep
-    __shared__ typename BlockRleSweepT::TempStorage temp_storage;
-
-    // Process tiles
-    BlockRleSweepT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        queue,
-        tile_status,
-        d_num_runs_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceRle
- */
-template <
-    typename            InputIterator,              ///< Random-access input iterator type for reading input items \iterator
-    typename            OffsetsOutputIterator,      ///< Random-access output iterator type for writing run-offset values \iterator
-    typename            LengthsOutputIterator,      ///< Random-access output iterator type for writing run-length values \iterator
-    typename            NumRunsOutputIterator,      ///< Output iterator type for recording the number of runs encountered \iterator
-    typename            EqualityOp,                 ///< T equality operator type
-    typename            Offset>                     ///< Signed integer type for global offsets
-struct DeviceRleDispatch
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Signed integer type for run lengths
-    typedef typename std::iterator_traits<LengthsOutputIterator>::value_type Length;
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ReduceByKeyScanTileState<Length, Offset> ScanTileState;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockRleSweepPolicy<
-                96,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockRleSweepPolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockRleSweepPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            RleSweepPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockRleSweepPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockRleSweepPolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RleSweepPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_rle_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_rle_config.template Init<PtxRleSweepPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_rle_config.template Init<typename Policy350::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_rle_config.template Init<typename Policy300::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_rle_config.template Init<typename Policy200::RleSweepPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_rle_config.template Init<typename Policy130::RleSweepPolicy>();
-        }
-        else
-        {
-            device_rle_config.template Init<typename Policy100::RleSweepPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within BlockRleSweepPolicy.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        bool                    store_warp_time_slicing;
-        BlockScanAlgorithm      scan_algorithm;
-        cudaSharedMemConfig     smem_config;
-
-        template <typename BlockRleSweepPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockRleSweepPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockRleSweepPolicy::ITEMS_PER_THREAD;
-            load_policy                 = BlockRleSweepPolicy::LOAD_ALGORITHM;
-            store_warp_time_slicing     = BlockRleSweepPolicy::STORE_WARP_TIME_SLICING;
-            scan_algorithm              = BlockRleSweepPolicy::SCAN_ALGORITHM;
-            smem_config                 = cudaSharedMemBankSizeEightByte;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                store_warp_time_slicing,
-                scan_algorithm,
-                smem_config);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide run-length-encode using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceRleSweepKernelPtr>        ///< Function type of cub::DeviceRleSweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        OffsetsOutputIterator       d_offsets_out,                  ///< [out] Pointer to the output sequence of run-offsets
-        LengthsOutputIterator       d_lengths_out,                  ///< [out] Pointer to the output sequence of run-lengths
-        NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out)
-        EqualityOp                  equality_op,                    ///< [in] Equality operator for input items
-        Offset                      num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceRleSweepKernelPtr     device_rle_sweep_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel
-        KernelConfig                device_rle_config)              ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[2];
-            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-            allocation_sizes[1] = GridQueue<int>::AllocationSize();                                             // bytes needed for grid queue descriptor
-
-            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
-            void* allocations[2];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Construct the tile status interface
-            ScanTileState tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Construct the grid queue descriptor
-            GridQueue<int> queue(allocations[1]);
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                queue,
-                tile_status,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for device_rle_sweep_kernel
-            int device_rle_kernel_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                device_rle_kernel_sm_occupancy,            // out
-                sm_version,
-                device_rle_sweep_kernel,
-                device_rle_config.block_threads))) break;
-
-            // Get grid size for scanning tiles
-            dim3 rle_grid_size;
-            int max_dim_x = 32 * 1024;
-            rle_grid_size.z = 1;
-            rle_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
-            rle_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log device_rle_sweep_kernel configuration
-            if (debug_synchronous) CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                rle_grid_size.x, rle_grid_size.y, rle_grid_size.z, device_rle_config.block_threads, (long long) stream, device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy);
-
-#if (CUB_PTX_ARCH == 0)
-            // Get current smem bank configuration
-            cudaSharedMemConfig original_smem_config;
-            if (CubDebug(error = cudaDeviceGetSharedMemConfig(&original_smem_config))) break;
-            cudaSharedMemConfig current_smem_config = original_smem_config;
-
-            // Update smem config if necessary
-            if (current_smem_config != device_rle_config.smem_config)
-            {
-                if (CubDebug(error = cudaDeviceSetSharedMemConfig(device_rle_config.smem_config))) break;
-                current_smem_config = device_rle_config.smem_config;
-            }
-#endif
-
-            // Invoke device_rle_sweep_kernel
-            device_rle_sweep_kernel<<<rle_grid_size, device_rle_config.block_threads, 0, stream>>>(
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                tile_status,
-                equality_op,
-                num_items,
-                num_tiles,
-                queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-#if (CUB_PTX_ARCH == 0)
-            // Reset smem config if necessary
-            if (current_smem_config != original_smem_config)
-            {
-                if (CubDebug(error = cudaDeviceSetSharedMemConfig(original_smem_config))) break;
-            }
-#endif
-
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to input sequence of data items
-        OffsetsOutputIterator       d_offsets_out,                  ///< [out] Pointer to output sequence of run-offsets
-        LengthsOutputIterator       d_lengths_out,                  ///< [out] Pointer to output sequence of run-lengths
-        NumRunsOutputIterator       d_num_runs_out,                     ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out)
-        EqualityOp                  equality_op,                    ///< [in] Equality operator for input items
-        Offset                      num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_rle_config;
-            InitConfigs(ptx_version, device_rle_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_offsets_out,
-                d_lengths_out,
-                d_num_runs_out,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<Offset, ScanTileState>,
-                DeviceRleSweepKernel<PtxRleSweepPolicy, InputIterator, OffsetsOutputIterator, LengthsOutputIterator, NumRunsOutputIterator, ScanTileState, EqualityOp, Offset>,
-                device_rle_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh
deleted file mode 100644
index 8dff45e5d..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_scan_dispatch.cuh
+++ /dev/null
@@ -1,565 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "../../block_sweep/block_scan_sweep.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_debug.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Initialization kernel for tile status initialization (multi-block)
- */
-template <
-    typename            Offset,                 ///< Signed integer type for global offsets
-    typename            ScanTileState>          ///< Tile status interface type
-__global__ void DeviceScanInitKernel(
-    GridQueue<Offset>   grid_queue,             ///< [in] Descriptor for performing dynamic mapping of input tiles to thread blocks
-    ScanTileState       tile_status,            ///< [in] Tile status interface
-    int                 num_tiles)              ///< [in] Number of tiles
-{
-    // Reset queue descriptor
-    if ((blockIdx.x == 0) && (threadIdx.x == 0))
-        grid_queue.FillAndResetDrain(num_tiles);
-
-    // Initialize tile status
-    tile_status.InitializeStatus(num_tiles);
-}
-
-
-/**
- * Scan kernel entry point (multi-block)
- */
-template <
-    typename            BlockScanSweepPolicy,       ///< Parameterized BlockScanSweepPolicy tuning policy type
-    typename            InputIterator,              ///< Random-access input iterator type for reading scan inputs \iterator
-    typename            OutputIterator,             ///< Random-access output iterator type for writing scan outputs \iterator
-    typename            ScanTileState,              ///< Tile status interface type
-    typename            ScanOp,                     ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename            Identity,                   ///< Identity value type (cub::NullType for inclusive scans)
-    typename            Offset>                     ///< Signed integer type for global offsets
-__launch_bounds__ (int(BlockScanSweepPolicy::BLOCK_THREADS))
-__global__ void DeviceScanSweepKernel(
-    InputIterator       d_in,                       ///< Input data
-    OutputIterator      d_out,                      ///< Output data
-    ScanTileState       tile_status,                ///< [in] Tile status interface
-    ScanOp              scan_op,                    ///< Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-    Identity            identity,                   ///< Identity element
-    Offset              num_items,                  ///< Total number of scan items for the entire problem
-    GridQueue<int>      queue)                      ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for scanning input tiles
-    typedef BlockScanSweep<
-        BlockScanSweepPolicy,
-        InputIterator,
-        OutputIterator,
-        ScanOp,
-        Identity,
-        Offset> BlockScanSweepT;
-
-    // Shared memory for BlockScanSweep
-    __shared__ typename BlockScanSweepT::TempStorage temp_storage;
-
-    // Process tiles
-    BlockScanSweepT(temp_storage, d_in, d_out, scan_op, identity).ConsumeRange(
-        num_items,
-        queue,
-        tile_status);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceScan
- */
-template <
-    typename InputIterator,      ///< Random-access input iterator type for reading scan inputs \iterator
-    typename OutputIterator,     ///< Random-access output iterator type for writing scan outputs \iterator
-    typename ScanOp,             ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
-    typename Identity,           ///< Identity value type (cub::NullType for inclusive scans)
-    typename Offset>             ///< Signed integer type for global offsets
-struct DeviceScanDispatch
-{
-    enum
-    {
-        INIT_KERNEL_THREADS     = 128
-    };
-
-    // Data type
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<T> ScanTileState;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 12,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T
-        typedef BlockScanSweepPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                false,
-                LOAD_LDG,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeScanPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockScanSweepPolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                false,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                false,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeScanPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T
-        typedef BlockScanSweepPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                false,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                false,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeScanPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 21,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockScanSweepPolicy<
-                96,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                false,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                false,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeScanPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockScanSweepPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                true,
-                LOAD_DEFAULT,
-                BLOCK_STORE_WARP_TRANSPOSE,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            RangeScanPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRangeScanPolicy : PtxPolicy::RangeScanPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_scan_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_scan_sweep_config.template Init<PtxRangeScanPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_scan_sweep_config.template Init<typename Policy350::RangeScanPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_scan_sweep_config.template Init<typename Policy300::RangeScanPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_scan_sweep_config.template Init<typename Policy200::RangeScanPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_scan_sweep_config.template Init<typename Policy130::RangeScanPolicy>();
-        }
-        else
-        {
-            device_scan_sweep_config.template Init<typename Policy100::RangeScanPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within BlockScanSweepPolicy.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        BlockStoreAlgorithm     store_policy;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename BlockScanSweepPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockScanSweepPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockScanSweepPolicy::ITEMS_PER_THREAD;
-            load_policy                 = BlockScanSweepPolicy::LOAD_ALGORITHM;
-            store_policy                = BlockScanSweepPolicy::STORE_ALGORITHM;
-            scan_algorithm              = BlockScanSweepPolicy::SCAN_ALGORITHM;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                store_policy,
-                scan_algorithm);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide prefix scan using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceScanSweepKernelPtr>       ///< Function type of cub::DeviceScanSweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator              d_out,                          ///< [out] Pointer to the output sequence of data items
-        ScanOp                      scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        Identity                    identity,                       ///< [in] Identity element
-        Offset                      num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceScanSweepKernelPtr    device_scan_sweep_kernel,       ///< [in] Kernel function pointer to parameterization of cub::DeviceScanSweepKernel
-        KernelConfig                device_scan_sweep_config)       ///< [in] Dispatch parameters that match the policy that \p device_scan_sweep_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_scan_sweep_config.block_threads * device_scan_sweep_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[2];
-            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-            allocation_sizes[1] = GridQueue<int>::AllocationSize();                                             // bytes needed for grid queue descriptor
-
-            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
-            void* allocations[2];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Construct the tile status interface
-            ScanTileState tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Construct the grid queue descriptor
-            GridQueue<int> queue(allocations[1]);
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                queue,
-                tile_status,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for device_scan_sweep_kernel
-            int range_scan_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                range_scan_sm_occupancy,            // out
-                sm_version,
-                device_scan_sweep_kernel,
-                device_scan_sweep_config.block_threads))) break;
-
-            // Get grid size for scanning tiles
-            dim3 scan_grid_size;
-            if (ptx_version <= 130)
-            {
-                // Blocks are launched in order, so just assign one block per tile
-                int max_dim_x = 32 * 1024;
-                scan_grid_size.z = 1;
-                scan_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
-                scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-            }
-            else
-            {
-                // Blocks may not be launched in order, so use atomics
-                int range_scan_occupancy = range_scan_sm_occupancy * sm_count;        // Whole-device occupancy for device_scan_sweep_kernel
-                scan_grid_size.z = 1;
-                scan_grid_size.y = 1;
-                scan_grid_size.x = (num_tiles < range_scan_occupancy) ?
-                    num_tiles :                     // Not enough to fill the device with threadblocks
-                    range_scan_occupancy;          // Fill the device with threadblocks
-            }
-
-            // Log device_scan_sweep_kernel configuration
-            if (debug_synchronous) CubLog("Invoking device_scan_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_scan_sweep_config.block_threads, (long long) stream, device_scan_sweep_config.items_per_thread, range_scan_sm_occupancy);
-
-            // Invoke device_scan_sweep_kernel
-            device_scan_sweep_kernel<<<scan_grid_size, device_scan_sweep_config.block_threads, 0, stream>>>(
-                d_in,
-                d_out,
-                tile_status,
-                scan_op,
-                identity,
-                num_items,
-                queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void            *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t          &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator   d_in,                           ///< [in] Pointer to the input sequence of data items
-        OutputIterator  d_out,                          ///< [out] Pointer to the output sequence of data items
-        ScanOp          scan_op,                        ///< [in] Binary scan functor (e.g., an instance of cub::Sum, cub::Min, cub::Max, etc.)
-        Identity        identity,                       ///< [in] Identity element
-        Offset          num_items,                      ///< [in] Total number of input items (i.e., the length of \p d_in)
-        cudaStream_t    stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool            debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_scan_sweep_config;
-            InitConfigs(ptx_version, device_scan_sweep_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_out,
-                scan_op,
-                identity,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<Offset, ScanTileState>,
-                DeviceScanSweepKernel<PtxRangeScanPolicy, InputIterator, OutputIterator, ScanTileState, ScanOp, Identity, Offset>,
-                device_scan_sweep_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh b/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh
deleted file mode 100644
index ba35f8bef..000000000
--- a/thrust/system/cuda/detail/cub/device/dispatch/device_select_dispatch.cuh
+++ /dev/null
@@ -1,550 +0,0 @@
-
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within global memory.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include <iterator>
-
-#include "device_scan_dispatch.cuh"
-#include "../../block_sweep/block_select_sweep.cuh"
-#include "../../thread/thread_operators.cuh"
-#include "../../grid/grid_queue.cuh"
-#include "../../util_device.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/******************************************************************************
- * Kernel entry points
- *****************************************************************************/
-
-/**
- * Select kernel entry point (multi-block)
- *
- * Performs functor-based selection if SelectOp functor type != NullType
- * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType
- * Otherwise performs discontinuity selection (keep unique)
- */
-template <
-    typename            BlockSelectSweepPolicy,     ///< Parameterized BlockSelectSweepPolicy tuning policy type
-    typename            InputIterator,              ///< Random-access input iterator type for reading input items
-    typename            FlagsInputIterator,               ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename            SelectedOutputIterator,             ///< Random-access output iterator type for writing selected items
-    typename            NumSelectedIterator,        ///< Output iterator type for recording the number of items selected
-    typename            ScanTileState,              ///< Tile status interface type
-    typename            SelectOp,                   ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename            EqualityOp,                 ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename            Offset,                     ///< Signed integer type for global offsets
-    bool                KEEP_REJECTS>               ///< Whether or not we push rejected items to the back of the output
-__launch_bounds__ (int(BlockSelectSweepPolicy::BLOCK_THREADS))
-__global__ void DeviceSelectSweepKernel(
-    InputIterator       d_in,                       ///< [in] Pointer to the input sequence of data items
-    FlagsInputIterator        d_flags,                    ///< [in] Pointer to the input sequence of selection flags
-    SelectedOutputIterator      d_selected_out,                      ///< [out] Pointer to the output sequence of selected data items
-    NumSelectedIterator d_num_selected_out,             ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-    ScanTileState       tile_status,                ///< [in] Tile status interface
-    SelectOp            select_op,                  ///< [in] Selection operator
-    EqualityOp          equality_op,                ///< [in] Equality operator
-    Offset              num_items,                  ///< [in] Total number of input items (i.e., length of \p d_in)
-    int                 num_tiles,                  ///< [in] Total number of tiles for the entire problem
-    GridQueue<int>      queue)                      ///< [in] Drain queue descriptor for dynamically mapping tile data onto thread blocks
-{
-    // Thread block type for selecting data from input tiles
-    typedef BlockSelectSweep<
-        BlockSelectSweepPolicy,
-        InputIterator,
-        FlagsInputIterator,
-        SelectedOutputIterator,
-        SelectOp,
-        EqualityOp,
-        Offset,
-        KEEP_REJECTS> BlockSelectSweepT;
-
-    // Shared memory for BlockSelectSweep
-    __shared__ typename BlockSelectSweepT::TempStorage temp_storage;
-
-    // Process tiles
-    BlockSelectSweepT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange(
-        num_tiles,
-        queue,
-        tile_status,
-        d_num_selected_out);
-}
-
-
-
-
-/******************************************************************************
- * Dispatch
- ******************************************************************************/
-
-/**
- * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect
- */
-template <
-    typename    InputIterator,                  ///< Random-access input iterator type for reading input items
-    typename    FlagsInputIterator,                   ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection)
-    typename    SelectedOutputIterator,                 ///< Random-access output iterator type for writing selected items
-    typename    NumSelectedIterator,            ///< Output iterator type for recording the number of items selected
-    typename    SelectOp,                       ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection)
-    typename    EqualityOp,                     ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection)
-    typename    Offset,                         ///< Signed integer type for global offsets
-    bool        KEEP_REJECTS>                   ///< Whether or not we push rejected items to the back of the output
-struct DeviceSelectDispatch
-{
-    /******************************************************************************
-     * Types and constants
-     ******************************************************************************/
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-    // Data type of flag iterator
-    typedef typename std::iterator_traits<FlagsInputIterator>::value_type Flag;
-
-    enum
-    {
-        INIT_KERNEL_THREADS = 128,
-    };
-
-    // Tile status descriptor interface type
-    typedef ScanTileState<Offset> ScanTileState;
-
-
-    /******************************************************************************
-     * Tuning policies
-     ******************************************************************************/
-
-    /// SM35
-    struct Policy350
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 17,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockSelectSweepPolicy<
-                96,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_DIRECT,
-                LOAD_LDG,
-                true,
-                BLOCK_SCAN_WARP_SCANS>
-            RangeSelectPolicy;
-    };
-
-    /// SM30
-    struct Policy300
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 5,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockSelectSweepPolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeSelectPolicy;
-    };
-
-    /// SM20
-    struct Policy200
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockSelectSweepPolicy<
-                128,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                false,
-                BLOCK_SCAN_WARP_SCANS>
-            RangeSelectPolicy;
-    };
-
-    /// SM13
-    struct Policy130
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockSelectSweepPolicy<
-                64,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeSelectPolicy;
-    };
-
-    /// SM10
-    struct Policy100
-    {
-        enum {
-            NOMINAL_4B_ITEMS_PER_THREAD = 9,
-            ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
-        };
-
-        typedef BlockSelectSweepPolicy<
-                256,
-                ITEMS_PER_THREAD,
-                BLOCK_LOAD_WARP_TRANSPOSE,
-                LOAD_DEFAULT,
-                true,
-                BLOCK_SCAN_RAKING_MEMOIZE>
-            RangeSelectPolicy;
-    };
-
-
-    /******************************************************************************
-     * Tuning policies of current PTX compiler pass
-     ******************************************************************************/
-
-#if (CUB_PTX_ARCH >= 350)
-    typedef Policy350 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 300)
-    typedef Policy300 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 200)
-    typedef Policy200 PtxPolicy;
-
-#elif (CUB_PTX_ARCH >= 130)
-    typedef Policy130 PtxPolicy;
-
-#else
-    typedef Policy100 PtxPolicy;
-
-#endif
-
-    // "Opaque" policies (whose parameterizations aren't reflected in the type signature)
-    struct PtxRangeSelectPolicy : PtxPolicy::RangeSelectPolicy {};
-
-
-    /******************************************************************************
-     * Utilities
-     ******************************************************************************/
-
-    /**
-     * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use
-     */
-    template <typename KernelConfig>
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static void InitConfigs(
-        int             ptx_version,
-        KernelConfig    &device_select_sweep_config)
-    {
-    #if (CUB_PTX_ARCH > 0)
-
-        // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy
-        device_select_sweep_config.template Init<PtxRangeSelectPolicy>();
-
-    #else
-
-        // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version
-        if (ptx_version >= 350)
-        {
-            device_select_sweep_config.template Init<typename Policy350::RangeSelectPolicy>();
-        }
-        else if (ptx_version >= 300)
-        {
-            device_select_sweep_config.template Init<typename Policy300::RangeSelectPolicy>();
-        }
-        else if (ptx_version >= 200)
-        {
-            device_select_sweep_config.template Init<typename Policy200::RangeSelectPolicy>();
-        }
-        else if (ptx_version >= 130)
-        {
-            device_select_sweep_config.template Init<typename Policy130::RangeSelectPolicy>();
-        }
-        else
-        {
-            device_select_sweep_config.template Init<typename Policy100::RangeSelectPolicy>();
-        }
-
-    #endif
-    }
-
-
-    /**
-     * Kernel kernel dispatch configuration.  Mirrors the constants within BlockSelectSweepPolicy.
-     */
-    struct KernelConfig
-    {
-        int                     block_threads;
-        int                     items_per_thread;
-        BlockLoadAlgorithm      load_policy;
-        bool                    store_warp_time_slicing;
-        BlockScanAlgorithm      scan_algorithm;
-
-        template <typename BlockSelectSweepPolicy>
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Init()
-        {
-            block_threads               = BlockSelectSweepPolicy::BLOCK_THREADS;
-            items_per_thread            = BlockSelectSweepPolicy::ITEMS_PER_THREAD;
-            load_policy                 = BlockSelectSweepPolicy::LOAD_ALGORITHM;
-            store_warp_time_slicing     = BlockSelectSweepPolicy::STORE_WARP_TIME_SLICING;
-            scan_algorithm              = BlockSelectSweepPolicy::SCAN_ALGORITHM;
-        }
-
-        CUB_RUNTIME_FUNCTION __forceinline__
-        void Print()
-        {
-            printf("%d, %d, %d, %d, %d",
-                block_threads,
-                items_per_thread,
-                load_policy,
-                store_warp_time_slicing,
-                scan_algorithm);
-        }
-    };
-
-
-    /******************************************************************************
-     * Dispatch entrypoints
-     ******************************************************************************/
-
-    /**
-     * Internal dispatch routine for computing a device-wide selection using the
-     * specified kernel functions.
-     */
-    template <
-        typename                    DeviceScanInitKernelPtr,        ///< Function type of cub::DeviceScanInitKernel
-        typename                    DeviceSelectSweepKernelPtr>     ///< Function type of cub::DeviceSelectSweepKernelPtr
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIterator          d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        SelectedOutputIterator      d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOp                    select_op,                      ///< [in] Selection operator
-        EqualityOp                  equality_op,                    ///< [in] Equality operator
-        Offset                      num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous,              ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-        int                         ptx_version,                    ///< [in] PTX version of dispatch kernels
-        DeviceScanInitKernelPtr     device_scan_init_kernel,        ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel
-        DeviceSelectSweepKernelPtr  device_select_sweep_kernel,     ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel
-        KernelConfig                device_select_sweep_config)     ///< [in] Dispatch parameters that match the policy that \p device_select_sweep_kernel was compiled for
-    {
-
-#ifndef CUB_RUNTIME_ENABLED
-
-        // Kernel launch not supported from this device
-        return CubDebug(cudaErrorNotSupported);
-
-#else
-
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get device ordinal
-            int device_ordinal;
-            if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-            // Get device SM version
-            int sm_version;
-            if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-            // Get SM count
-            int sm_count;
-            if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break;
-
-            // Number of input tiles
-            int tile_size = device_select_sweep_config.block_threads * device_select_sweep_config.items_per_thread;
-            int num_tiles = (num_items + tile_size - 1) / tile_size;
-
-            // Specify temporary storage allocation requirements
-            size_t  allocation_sizes[2];
-            if (CubDebug(error = ScanTileState::AllocationSize(num_tiles, allocation_sizes[0]))) break;    // bytes needed for tile status descriptors
-            allocation_sizes[1] = GridQueue<int>::AllocationSize();                                             // bytes needed for grid queue descriptor
-
-            // Compute allocation pointers into the single storage blob (or set the necessary size of the blob)
-            void* allocations[2];
-            if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
-            if (d_temp_storage == NULL)
-            {
-                // Return if the caller is simply requesting the size of the storage allocation
-                return cudaSuccess;
-            }
-
-            // Construct the tile status interface
-            ScanTileState tile_status;
-            if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break;
-
-            // Construct the grid queue descriptor
-            GridQueue<int> queue(allocations[1]);
-
-            // Log device_scan_init_kernel configuration
-            int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS;
-            if (debug_synchronous) CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream);
-
-            // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
-            device_scan_init_kernel<<<init_grid_size, INIT_KERNEL_THREADS, 0, stream>>>(
-                queue,
-                tile_status,
-                num_tiles);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-
-            // Get SM occupancy for device_select_sweep_kernel
-            int range_select_sm_occupancy;
-            if (CubDebug(error = MaxSmOccupancy(
-                range_select_sm_occupancy,            // out
-                sm_version,
-                device_select_sweep_kernel,
-                device_select_sweep_config.block_threads))) break;
-
-            // Get grid size for scanning tiles
-            dim3 select_grid_size;
-            int max_dim_x = 32 * 1024;
-            select_grid_size.z = 1;
-            select_grid_size.y = (num_tiles + max_dim_x - 1) / max_dim_x;
-            select_grid_size.x = CUB_MIN(num_tiles, max_dim_x);
-
-            // Log device_select_sweep_kernel configuration
-            if (debug_synchronous) CubLog("Invoking device_select_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n",
-                select_grid_size.x, select_grid_size.y, select_grid_size.z, device_select_sweep_config.block_threads, (long long) stream, device_select_sweep_config.items_per_thread, range_select_sm_occupancy);
-
-            // Invoke device_select_sweep_kernel
-            device_select_sweep_kernel<<<select_grid_size, device_select_sweep_config.block_threads, 0, stream>>>(
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                tile_status,
-                select_op,
-                equality_op,
-                num_items,
-                num_tiles,
-                queue);
-
-            // Check for failure to launch
-            if (CubDebug(error = cudaPeekAtLastError())) break;
-
-            // Sync the stream if specified to flush runtime errors
-            if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break;
-        }
-        while (0);
-
-        return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-    }
-
-
-    /**
-     * Internal dispatch routine
-     */
-    CUB_RUNTIME_FUNCTION __forceinline__
-    static cudaError_t Dispatch(
-        void                        *d_temp_storage,                ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-        size_t                      &temp_storage_bytes,            ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
-        InputIterator               d_in,                           ///< [in] Pointer to the input sequence of data items
-        FlagsInputIterator          d_flags,                        ///< [in] Pointer to the input sequence of selection flags
-        SelectedOutputIterator      d_selected_out,                 ///< [in] Pointer to the output sequence of selected data items
-        NumSelectedIterator         d_num_selected_out,                 ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out)
-        SelectOp                    select_op,                      ///< [in] Selection operator
-        EqualityOp                  equality_op,                    ///< [in] Equality operator
-        Offset                      num_items,                      ///< [in] Total number of input items (i.e., length of \p d_in)
-        cudaStream_t                stream,                         ///< [in] <b>[optional]</b> CUDA stream to launch kernels within.  Default is stream<sub>0</sub>.
-        bool                        debug_synchronous)              ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors.  Also causes launch configurations to be printed to the console.  Default is \p false.
-    {
-        cudaError error = cudaSuccess;
-        do
-        {
-            // Get PTX version
-            int ptx_version;
-    #if (CUB_PTX_ARCH == 0)
-            if (CubDebug(error = PtxVersion(ptx_version))) break;
-    #else
-            ptx_version = CUB_PTX_ARCH;
-    #endif
-
-            // Get kernel kernel dispatch configurations
-            KernelConfig device_select_sweep_config;
-            InitConfigs(ptx_version, device_select_sweep_config);
-
-            // Dispatch
-            if (CubDebug(error = Dispatch(
-                d_temp_storage,
-                temp_storage_bytes,
-                d_in,
-                d_flags,
-                d_selected_out,
-                d_num_selected_out,
-                select_op,
-                equality_op,
-                num_items,
-                stream,
-                debug_synchronous,
-                ptx_version,
-                DeviceScanInitKernel<Offset, ScanTileState>,
-                DeviceSelectSweepKernel<PtxRangeSelectPolicy, InputIterator, FlagsInputIterator, SelectedOutputIterator, NumSelectedIterator, ScanTileState, SelectOp, EqualityOp, Offset, KEEP_REJECTS>,
-                device_select_sweep_config))) break;
-        }
-        while (0);
-
-        return error;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh b/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
deleted file mode 100644
index eab5b518e..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh
+++ /dev/null
@@ -1,211 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-
-#pragma once
-
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-#include "../thread/thread_load.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
- */
-class GridBarrier
-{
-protected :
-
-    typedef unsigned int SyncFlag;
-
-    // Counters in global device memory
-    SyncFlag* d_sync;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrier() : d_sync(NULL) {}
-
-
-    /**
-     * Synchronize
-     */
-    __device__ __forceinline__ void Sync() const
-    {
-        volatile SyncFlag *d_vol_sync = d_sync;
-
-        // Threadfence and syncthreads to make sure global writes are visible before
-        // thread-0 reports in with its sync counter
-        __threadfence();
-        __syncthreads();
-
-        if (blockIdx.x == 0)
-        {
-            // Report in ourselves
-            if (threadIdx.x == 0)
-            {
-                d_vol_sync[blockIdx.x] = 1;
-            }
-
-            __syncthreads();
-
-            // Wait for everyone else to report in
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            __syncthreads();
-
-            // Let everyone know it's safe to proceed
-            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
-            {
-                d_vol_sync[peer_block] = 0;
-            }
-        }
-        else
-        {
-            if (threadIdx.x == 0)
-            {
-                // Report in
-                d_vol_sync[blockIdx.x] = 1;
-
-                // Wait for acknowledgment
-                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
-                {
-                    __threadfence_block();
-                }
-            }
-
-            __syncthreads();
-        }
-    }
-};
-
-
-/**
- * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
- *
- * Uses RAII for lifetime, i.e., device resources are reclaimed when
- * the destructor is called.
- */
-class GridBarrierLifetime : public GridBarrier
-{
-protected:
-
-    // Number of bytes backed by d_sync
-    size_t sync_bytes;
-
-public:
-
-    /**
-     * Constructor
-     */
-    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}
-
-
-    /**
-     * DeviceFrees and resets the progress counters
-     */
-    cudaError_t HostReset()
-    {
-        cudaError_t retval = cudaSuccess;
-        if (d_sync)
-        {
-            CubDebug(retval = cudaFree(d_sync));
-            d_sync = NULL;
-        }
-        sync_bytes = 0;
-        return retval;
-    }
-
-
-    /**
-     * Destructor
-     */
-    virtual ~GridBarrierLifetime()
-    {
-        HostReset();
-    }
-
-
-    /**
-     * Sets up the progress counters for the next kernel launch (lazily
-     * allocating and initializing them if necessary)
-     */
-    cudaError_t Setup(int sweep_grid_size)
-    {
-        cudaError_t retval = cudaSuccess;
-        do {
-            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
-            if (new_sync_bytes > sync_bytes)
-            {
-                if (d_sync)
-                {
-                    if (CubDebug(retval = cudaFree(d_sync))) break;
-                }
-
-                sync_bytes = new_sync_bytes;
-
-                // Allocate and initialize to zero
-                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
-                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
-            }
-        } while (0);
-
-        return retval;
-    }
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh b/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
deleted file mode 100644
index a35563298..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh
+++ /dev/null
@@ -1,185 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion.  Each threadblock gets roughly the same number of fixed-size work units (grains).
- */
-
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_macro.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridEvenShare is a descriptor utility for distributing input among CUDA threadblocks in an "even-share" fashion.  Each threadblock gets roughly the same number of fixed-size work units (grains).
- *
- * \par Overview
- * GridEvenShare indicates which sections of input are to be mapped onto which threadblocks.
- * Threadblocks may receive one of three different amounts of work: "big", "normal",
- * and "last".  The "big" workloads are one scheduling grain larger than "normal".  The "last" work unit
- * for the last threadblock may be partially-full if the input is not an even multiple of
- * the scheduling grain size.
- *
- * \par
- * Before invoking a child grid, a parent thread will typically construct an instance of
- * GridEvenShare.  The instance can be passed to child threadblocks which can
- * initialize their per-threadblock offsets using \p BlockInit().
- *
- * \tparam Offset       Signed integer type for global offsets
- */
-template <typename Offset>
-struct GridEvenShare
-{
-    Offset      total_grains;
-    int         big_blocks;
-    Offset      big_share;
-    Offset      normal_share;
-    Offset      normal_base_offset;
-
-    /// Total number of input items
-    Offset      num_items;
-
-    /// Grid size in threadblocks
-    int         grid_size;
-
-    /// Offset into input marking the beginning of the owning thread block's segment of input tiles
-    Offset      block_offset;
-
-    /// Offset into input of marking the end (one-past) of the owning thread block's segment of input tiles
-    Offset      block_end;
-
-    /**
-     * \brief Default constructor.  Zero-initializes block-specific fields.
-     */
-    __host__ __device__ __forceinline__ GridEvenShare() :
-        num_items(0),
-        grid_size(0),
-        block_offset(0),
-        block_end(0) {}
-
-    /**
-     * \brief Constructor.  Initializes the grid-specific members \p num_items and \p grid_size. To be called prior prior to kernel launch)
-     */
-    __host__ __device__ __forceinline__ GridEvenShare(
-        Offset   num_items,                 ///< Total number of input items
-        int     max_grid_size,              ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items)
-        int     schedule_granularity)       ///< Granularity by which the input can be parcelled into and distributed among threablocks.  Usually the thread block's native tile size (or a multiple thereof.
-    {
-        this->num_items             = num_items;
-        this->block_offset          = num_items;
-        this->block_end             = num_items;
-        this->total_grains          = (num_items + schedule_granularity - 1) / schedule_granularity;
-        this->grid_size             = CUB_MIN(total_grains, max_grid_size);
-        Offset grains_per_block     = total_grains / grid_size;
-        this->big_blocks            = total_grains - (grains_per_block * grid_size);        // leftover grains go to big blocks
-        this->normal_share          = grains_per_block * schedule_granularity;
-        this->normal_base_offset    = big_blocks * schedule_granularity;
-        this->big_share             = normal_share + schedule_granularity;
-    }
-
-
-
-    /**
-     * \brief Initializes ranges for the specified partition index
-     */
-    __device__ __forceinline__ void Init(int partition_id)
-    {
-        if (partition_id < big_blocks)
-        {
-            // This threadblock gets a big share of grains (grains_per_block + 1)
-            block_offset = (partition_id * big_share);
-            block_end = block_offset + big_share;
-        }
-        else if (partition_id < total_grains)
-        {
-            // This threadblock gets a normal share of grains (grains_per_block)
-            block_offset = normal_base_offset + (partition_id * normal_share);
-            block_end = CUB_MIN(num_items, block_offset + normal_share);
-        }
-    }
-
-
-    /**
-     * \brief Initializes ranges for the current thread block (e.g., to be called by each threadblock after startup)
-     */
-    __device__ __forceinline__ void BlockInit()
-    {
-        Init(blockIdx.x);
-    }
-
-
-    /**
-     * Print to stdout
-     */
-    __host__ __device__ __forceinline__ void Print()
-    {
-        printf(
-#if (CUB_PTX_ARCH > 0)
-            "\tthreadblock(%d) "
-            "block_offset(%lu) "
-            "block_end(%lu) "
-#endif
-            "num_items(%lu)  "
-            "total_grains(%lu)  "
-            "big_blocks(%lu)  "
-            "big_share(%lu)  "
-            "normal_share(%lu)\n",
-#if (CUB_PTX_ARCH > 0)
-                blockIdx.x,
-                (unsigned long) block_offset,
-                (unsigned long) block_end,
-#endif
-                (unsigned long) num_items,
-                (unsigned long) total_grains,
-                (unsigned long) big_blocks,
-                (unsigned long) big_share,
-                (unsigned long) normal_share);
-    }
-};
-
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh b/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
deleted file mode 100644
index ff6679b9b..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh
+++ /dev/null
@@ -1,95 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/******************************************************************************
- * Mapping policies
- *****************************************************************************/
-
-
-/**
- * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks.
- */
-enum GridMappingStrategy
-{
-    /**
-     * \brief An "even-share" strategy for assigning input tiles to thread blocks.
-     *
-     * \par Overview
-     * The input is evenly partitioned into \p p segments, where \p p is
-     * constant and corresponds loosely to the number of thread blocks that may
-     * actively reside on the target device. Each segment is comprised of
-     * consecutive tiles, where a tile is a small, constant-sized unit of input
-     * to be processed to completion before the thread block terminates or
-     * obtains more work.  The kernel invokes \p p thread blocks, each
-     * of which iteratively consumes a segment of <em>n</em>/<em>p</em> elements
-     * in tile-size increments.
-     */
-    GRID_MAPPING_EVEN_SHARE,
-
-    /**
-     * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks.
-     *
-     * \par Overview
-     * The input is treated as a queue to be dynamically consumed by a grid of
-     * thread blocks.  Work is atomically dequeued in tiles, where a tile is a
-     * unit of input to be processed to completion before the thread block
-     * terminates or obtains more work.  The grid size \p p is constant,
-     * loosely corresponding to the number of thread blocks that may actively
-     * reside on the target device.
-     */
-    GRID_MAPPING_DYNAMIC,
-};
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh b/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
deleted file mode 100644
index 865661662..000000000
--- a/thrust/system/cuda/detail/cub/grid/grid_queue.cuh
+++ /dev/null
@@ -1,216 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::GridQueue is a descriptor utility for dynamic queue management.
- */
-
-#pragma once
-
-#include "../util_namespace.cuh"
-#include "../util_debug.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup GridModule
- * @{
- */
-
-
-/**
- * \brief GridQueue is a descriptor utility for dynamic queue management.
- *
- * \par Overview
- * GridQueue descriptors provides abstractions for "filling" or
- * "draining" globally-shared vectors.
- *
- * \par
- * A "filling" GridQueue works by atomically-adding to a zero-initialized counter,
- * returning a unique offset for the calling thread to write its items.
- * The GridQueue maintains the total "fill-size".  The fill counter must be reset
- * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that
- * will be filling.
- *
- * \par
- * Similarly, a "draining" GridQueue works by works by atomically-incrementing a
- * zero-initialized counter, returning a unique offset for the calling thread to
- * read its items. Threads can safely drain until the array's logical fill-size is
- * exceeded.  The drain counter must be reset using GridQueue::ResetDrain or
- * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that
- * will be filling.  (For dynamic work distribution of existing data, the corresponding fill-size
- * is simply the number of elements in the array.)
- *
- * \par
- * Iterative work management can be implemented simply with a pair of flip-flopping
- * work buffers, each with an associated set of fill and drain GridQueue descriptors.
- *
- * \tparam Offset Signed integer type for global offsets
- */
-template <typename Offset>
-class GridQueue
-{
-private:
-
-    /// Counter indices
-    enum
-    {
-        FILL    = 0,
-        DRAIN   = 1,
-    };
-
-    /// Pair of counters
-    Offset *d_counters;
-
-public:
-
-    /// Returns the device allocation size in bytes needed to construct a GridQueue instance
-    __host__ __device__ __forceinline__
-    static size_t AllocationSize()
-    {
-        return sizeof(Offset) * 2;
-    }
-
-
-    /// Constructs an invalid GridQueue descriptor
-    __host__ __device__ __forceinline__ GridQueue()
-    :
-        d_counters(NULL)
-    {}
-
-
-    /// Constructs a GridQueue descriptor around the device storage allocation
-    __host__ __device__ __forceinline__ GridQueue(
-        void *d_storage)                    ///< Device allocation to back the GridQueue.  Must be at least as big as <tt>AllocationSize()</tt>.
-    :
-        d_counters((Offset*) d_storage)
-    {}
-
-
-    /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain(
-        Offset fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        d_counters[FILL] = fill_size;
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        Offset counters[2];
-        counters[FILL] = fill_size;
-        counters[DRAIN] = 0;
-        return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(Offset) * 2, cudaMemcpyHostToDevice, stream));
-#endif
-    }
-
-
-    /// This operation resets the drain so that it may advance to meet the existing fill-size.  To be called by the host or by a kernel prior to that which will be draining.
-    __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        d_counters[DRAIN] = 0;
-        return cudaSuccess;
-#else
-        return FillAndResetDrain(0, stream);
-#endif
-    }
-
-
-    /// This operation resets the fill counter.  To be called by the host or by a kernel prior to that which will be filling.
-    __host__ __device__ __forceinline__ cudaError_t ResetFill()
-    {
-#if (CUB_PTX_ARCH > 0)
-        d_counters[FILL] = 0;
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemset(d_counters + FILL, 0, sizeof(Offset)));
-#endif
-    }
-
-
-    /// Returns the fill-size established by the parent or by the previous kernel.
-    __host__ __device__ __forceinline__ cudaError_t FillSize(
-        Offset &fill_size,
-        cudaStream_t stream = 0)
-    {
-#if (CUB_PTX_ARCH > 0)
-        fill_size = d_counters[FILL];
-        return cudaSuccess;
-#else
-        return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(Offset), cudaMemcpyDeviceToHost, stream));
-#endif
-    }
-
-
-    /// Drain num_items.  Returns offset from which to read items.
-    __device__ __forceinline__ Offset Drain(Offset num_items)
-    {
-        return atomicAdd(d_counters + DRAIN, num_items);
-    }
-
-
-    /// Fill num_items.  Returns offset from which to write items.
-    __device__ __forceinline__ Offset Fill(Offset num_items)
-    {
-        return atomicAdd(d_counters + FILL, num_items);
-    }
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Reset grid queue (call with 1 block of 1 thread)
- */
-template <typename Offset>
-__global__ void FillAndResetDrainKernel(
-    GridQueue<Offset>    grid_queue,
-    Offset               num_items)
-{
-    grid_queue.FillAndResetDrain(num_items);
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group GridModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-
diff --git a/thrust/system/cuda/detail/cub/host/spinlock.cuh b/thrust/system/cuda/detail/cub/host/spinlock.cuh
deleted file mode 100644
index 6e4b47c7d..000000000
--- a/thrust/system/cuda/detail/cub/host/spinlock.cuh
+++ /dev/null
@@ -1,123 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple x86/x64 atomic spinlock, portable across MS Windows (cl.exe) & Linux (g++)
- */
-
-
-#pragma once
-
-#if defined(_WIN32) || defined(_WIN64)
-    #include <intrin.h>
-    #include <windows.h>
-    #undef small            // Windows is terrible for polluting macro namespace
-
-    /**
-     * Compiler read/write barrier
-     */
-    #pragma intrinsic(_ReadWriteBarrier)
-
-#endif
-
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-#if defined(_MSC_VER)
-
-    // Microsoft VC++
-    typedef long Spinlock;
-
-#else
-
-    // GNU g++
-    typedef int Spinlock;
-
-    /**
-     * Compiler read/write barrier
-     */
-    __forceinline__ void _ReadWriteBarrier()
-    {
-        __sync_synchronize();
-    }
-
-    /**
-     * Atomic exchange
-     */
-    __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value)
-    {
-        // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier
-        _ReadWriteBarrier();
-        return __sync_lock_test_and_set(Target, Value);
-    }
-
-    /**
-     * Pause instruction to prevent excess processor bus usage
-     */
-    __forceinline__ void YieldProcessor()
-    {
-#ifndef __arm__
-        asm volatile("pause\n": : :"memory");
-#endif  // __arm__
-    }
-
-#endif  // defined(_MSC_VER)
-
-/**
- * Return when the specified spinlock has been acquired
- */
-__forceinline__ void Lock(volatile Spinlock *lock)
-{
-    while (1)
-    {
-        if (!_InterlockedExchange(lock, 1)) return;
-        while (*lock) YieldProcessor();
-    }
-}
-
-
-/**
- * Release the specified spinlock
- */
-__forceinline__ void Unlock(volatile Spinlock *lock)
-{
-    _ReadWriteBarrier();
-    *lock = 0;
-}
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
diff --git a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
deleted file mode 100644
index 03b842d43..000000000
--- a/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh
+++ /dev/null
@@ -1,255 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#include <thrust/version.h>
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p ItemOffsetPair tuples).
- *
- * \par Overview
- * - ArgIndexInputIterator wraps a random access input iterator \p itr of type \p InputIterator.
- *   Dereferencing an ArgIndexInputIterator at offset \p i produces a \p ItemOffsetPair value whose
- *   \p offset field is \p i and whose \p item field is <tt>itr[i]</tt>.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ArgIndexInputIterator to
- * dereference an array of doubles
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/arg_index_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;         // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::ArgIndexInputIterator<double*> itr(d_in);
- *
- * // Within device code:
- * typedef typename cub::ArgIndexInputIterator<double*>::value_type Tuple;
- * Tuple item_offset_pair.offset = *itr;
- * printf("%f @ %d\n",
- *  item_offset_pair.value,
- *  item_offset_pair.offset);   // 8.0 @ 0
- *
- * itr = itr + 6;
- * item_offset_pair.offset = *itr;
- * printf("%f @ %d\n",
- *  item_offset_pair.value,
- *  item_offset_pair.offset);   // 9.0 @ 6
- *
- * \endcode
- *
- * \tparam InputIterator        The type of the wrapped input iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    InputIterator,
-    typename    Offset = ptrdiff_t>
-class ArgIndexInputIterator
-{
-private:
-
-    // Data type of input iterator
-    typedef typename std::iterator_traits<InputIterator>::value_type T;
-
-public:
-
-
-    // Required iterator traits
-    typedef ArgIndexInputIterator               self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ItemOffsetPair<T, difference_type>  value_type;             ///< The type of the element the iterator can point to
-    typedef value_type*                         pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef value_type                          reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    InputIterator   itr;
-    difference_type offset;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ArgIndexInputIterator(
-        InputIterator   itr,            ///< Input iterator to wrap
-        difference_type offset = 0)     ///< Offset (in items) from \p itr denoting the position of the iterator
-    :
-        itr(itr),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        value_type retval;
-        retval.value = itr[offset];
-        retval.offset = offset;
-        return retval;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(itr, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(itr, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return *(*this + n);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((itr == rhs.itr) && (offset == rhs.offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((itr != rhs.itr) || (offset != rhs.offset));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
deleted file mode 100644
index 16ba3a4a9..000000000
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh
+++ /dev/null
@@ -1,240 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier.
- *
- * \par Overview
- * - CacheModifiedInputIterator is a random-access input iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by reading \p ValueType values through loads modified by \p MODIFIER.
- * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG",
- *   "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedInputIterator to
- * dereference a device array of double using the "ldg" PTX load modifier
- * (i.e., load values through texture cache).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_in;            // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> itr(d_in);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 8.0
- * printf("%f\n", itr[1]);  // 6.0
- * printf("%f\n", itr[6]);  // 9.0
- *
- * \endcode
- *
- * \tparam CacheLoadModifier    The cub::CacheLoadModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheLoadModifier   MODIFIER,
-    typename            ValueType,
-    typename            Offset = ptrdiff_t>
-class CacheModifiedInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedInputIterator          self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-
-private:
-
-    ValueType* ptr;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ CacheModifiedInputIterator(
-        ValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(ptr)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return ThreadLoad<MODIFIER>(ptr + n);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &ThreadLoad<MODIFIER>(ptr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
deleted file mode 100644
index 179ce146c..000000000
--- a/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh
+++ /dev/null
@@ -1,253 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access output wrapper for storing array values using a PTX cache-modifier.
- *
- * \par Overview
- * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native
- *   device pointer of type <tt>ValueType*</tt>. \p ValueType references are
- *   made by writing \p ValueType values through stores modified by \p MODIFIER.
- * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB",
- *   "STORE_CG", "STORE_CS", "STORE_WT", etc.).
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions, but can only be dereferenced within device functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to
- * dereference a device array of doubles using the "wt" PTX load modifier
- * (i.e., write-through to system memory).
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/cache_modified_output_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * double *d_out;              // e.g., [, , , , , , ]
- *
- * // Create an iterator wrapper
- * cub::CacheModifiedOutputIterator<cub::STORE_WT, double> itr(d_out);
- *
- * // Within device code:
- * itr[0]  = 8.0;
- * itr[1]  = 66.0;
- * itr[55] = 24.0;
- *
- * \endcode
- *
- * \par Usage Considerations
- * - Can only be dereferenced within device code
- *
- * \tparam CacheStoreModifier     The cub::CacheStoreModifier to use when accessing data
- * \tparam ValueType            The value type of this iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            ValueType,
-    typename            Offset = ptrdiff_t>
-class CacheModifiedOutputIterator
-{
-private:
-
-    // Proxy object
-    struct Reference
-    {
-        ValueType* ptr;
-
-        /// Constructor
-        __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {}
-
-        /// Assignment
-        __host__ __device__ __forceinline__ ValueType operator =(ValueType val)
-        {
-            ThreadStore<MODIFIER>(ptr, val);
-            return val;
-        }
-    };
-
-public:
-
-    // Required iterator traits
-    typedef CacheModifiedOutputIterator         self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef Reference                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType* ptr;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ CacheModifiedOutputIterator(
-        ValueType* ptr)     ///< Native pointer to wrap
-    :
-        ptr(ptr)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        ptr++;
-        return retval;
-    }
-
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        ptr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return Reference(ptr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(ptr + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        ptr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(ptr - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        ptr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return ptr - other.ptr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return Reference(ptr + n);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (ptr == rhs.ptr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (ptr != rhs.ptr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
deleted file mode 100644
index 4c386a6b8..000000000
--- a/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh
+++ /dev/null
@@ -1,235 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of homogeneous values
- *
- * \par Overview
- * - Read references to a ConstantInputIterator iterator always return the supplied constant
- *   of type \p ValueType.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p ConstantInputIterator to
- * dereference a sequence of homogeneous doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/constant_input_iterator.cuh>
- *
- * cub::ConstantInputIterator<double> itr(5.0);
- *
- * printf("%f\n", itr[0]);      // 5.0
- * printf("%f\n", itr[1]);      // 5.0
- * printf("%f\n", itr[2]);      // 5.0
- * printf("%f\n", itr[50]);     // 5.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename Offset = ptrdiff_t>
-class ConstantInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef ConstantInputIterator               self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType   val;
-    Offset      offset;
-#ifdef _WIN32
-    Offset      pad[CUB_MAX(1, (16 / sizeof(Offset) - 1))];        // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce)
-#endif
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ConstantInputIterator(
-        ValueType   val,            ///< Starting value for the iterator instance to report
-        Offset      offset = 0)     ///< Base offset
-    :
-        val(val),
-        offset(offset)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val, offset + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val, offset - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return offset - other.offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return val;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (offset == rhs.offset) && ((val == rhs.val));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (offset != rhs.offset) || (val!= rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "," << itr.offset << "]";
-        return os;
-    }
-
-};
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
deleted file mode 100644
index 7c6320f9f..000000000
--- a/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh
+++ /dev/null
@@ -1,228 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-/**
- * \brief A random-access input generator for dereferencing a sequence of incrementing integer values.
- *
- * \par Overview
- * - After initializing a CountingInputIterator to a certain integer \p base, read references
- *   at \p offset will return the value \p base + \p offset.
- * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device
- *   functions.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p CountingInputIterator to
- * dereference a sequence of incrementing integers.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/counting_input_iterator.cuh>
- *
- * cub::CountingInputIterator<int> itr(5);
- *
- * printf("%d\n", itr[0]);      // 5
- * printf("%d\n", itr[1]);      // 6
- * printf("%d\n", itr[2]);      // 7
- * printf("%d\n", itr[50]);     // 55
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename ValueType,
-    typename Offset = ptrdiff_t>
-class CountingInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef CountingInputIterator               self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ValueType val;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ CountingInputIterator(
-        const ValueType &val)          ///< Starting value for the iterator instance to report
-    :
-        val(val)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        val++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        val++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return val;
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(val + n);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        val += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(val - n);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        val -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return val - other.val;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return val + n;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &val;
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (val == rhs.val);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (val != rhs.val);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        os << "[" << itr.val << "]";
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
deleted file mode 100644
index be5c79c1f..000000000
--- a/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh
+++ /dev/null
@@ -1,308 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses newer Kepler-style texture objects.
- *
- * \par Overview
- * - TexObjInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - With regard to nested/dynamic parallelism, TexObjInputIterator iterators may only be
- *   created by the host thread, but can be used by any descendant kernel.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIterator to
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_obj_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexObjInputIterator<double> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    typename    Offset = ptrdiff_t>
-class TexObjInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexObjInputIterator                 self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    // Largest texture word we can use in device
-    typedef typename UnitWord<T>::TextureWord TextureWord;
-
-    // Number of texture words per T
-    enum {
-        TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-    };
-
-private:
-
-    T*                  ptr;
-    difference_type     tex_offset;
-    cudaTextureObject_t tex_obj;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TexObjInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0),
-        tex_obj(0)
-    {}
-
-    /// Use this iterator to bind \p ptr with a texture reference
-    cudaError_t BindTexture(
-        T               *ptr,               ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes,              ///< Number of bytes in the range
-        size_t          tex_offset = 0)     ///< Offset (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = ptr;
-        this->tex_offset = tex_offset;
-
-        cudaChannelFormatDesc   channel_desc = cudaCreateChannelDesc<TextureWord>();
-        cudaResourceDesc        res_desc;
-        cudaTextureDesc         tex_desc;
-        memset(&res_desc, 0, sizeof(cudaResourceDesc));
-        memset(&tex_desc, 0, sizeof(cudaTextureDesc));
-        res_desc.resType                = cudaResourceTypeLinear;
-        res_desc.res.linear.devPtr      = ptr;
-        res_desc.res.linear.desc        = channel_desc;
-        res_desc.res.linear.sizeInBytes = bytes;
-        tex_desc.readMode               = cudaReadModeElementType;
-        return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL);
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return cudaDestroyTextureObject(tex_obj);
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Move array of uninitialized words, then alias and assign to return value
-        TextureWord words[TEXTURE_MULTIPLE];
-
-        #pragma unroll
-        for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-        {
-            words[i] = tex1Dfetch<TextureWord>(
-                tex_obj,
-                (tex_offset * TEXTURE_MULTIPLE) + i);
-        }
-
-        // Load from words
-        return *reinterpret_cast<T*>(words);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr          = ptr;
-        retval.tex_obj      = tex_obj;
-        retval.tex_offset   = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return *(*this + n);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
deleted file mode 100644
index 3da53c609..000000000
--- a/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh
+++ /dev/null
@@ -1,370 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../util_namespace.cuh"
-
-#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
-
-#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/******************************************************************************
- * Static file-scope Tesla/Fermi-style texture references
- *****************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-// Anonymous namespace
-namespace {
-
-/// Global texture reference specialized by type
-template <typename T>
-struct IteratorTexRef
-{
-    /// And by unique ID
-    template <int UNIQUE_ID>
-    struct TexId
-    {
-        // Largest texture word we can use in device
-        typedef typename UnitWord<T>::DeviceWord DeviceWord;
-        typedef typename UnitWord<T>::TextureWord TextureWord;
-
-        // Number of texture words per T
-        enum {
-            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
-            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-        };
-
-        // Texture reference type
-        typedef texture<TextureWord> TexRef;
-
-        // Texture reference
-        static TexRef ref;
-
-        /// Bind texture
-        static cudaError_t BindTexture(void *d_in)
-        {
-            if (d_in)
-            {
-                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
-                ref.channelDesc = tex_desc;
-                return (CubDebug(cudaBindTexture(NULL, ref, d_in)));
-            }
-
-            return cudaSuccess;
-        }
-
-        /// Unbind texture
-        static cudaError_t UnbindTexture()
-        {
-            return CubDebug(cudaUnbindTexture(ref));
-        }
-
-        /// Fetch element
-        template <typename Distance>
-        static __device__ __forceinline__ T Fetch(Distance tex_offset)
-        {
-            DeviceWord temp[DEVICE_MULTIPLE];
-            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
-
-            #pragma unroll
-            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-            {
-                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
-            }
-
-            return reinterpret_cast<T&>(temp);
-        }
-    };
-};
-
-// Texture reference definitions
-template <typename  T>
-template <int       UNIQUE_ID>
-typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
-
-
-} // Anonymous namespace
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-
-/**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
- *
- * \par Overview
- * - TexRefInputIterator wraps a native device pointer of type <tt>ValueType*</tt>. References
- *   to elements are to be loaded through texture cache.
- * - Can be used to load any data type from memory through texture cache.
- * - Can be manipulated and exchanged within and between host and device
- *   functions, can only be constructed within host functions, and can only be
- *   dereferenced within device functions.
- * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture
- *   reference.  Only one TexRefInputIterator instance can be bound at any given time for a
- *   specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host
- *   thread, and (4) compilation .o unit.
- * - With regard to nested/dynamic parallelism, TexRefInputIterator iterators may only be
- *   created by the host thread and used by a top-level kernel (i.e. the one which is launched
- *   from the host).
- * - Compatible with Thrust API v1.7 or newer.
- * - Compatible with CUDA toolkit v5.5 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TexRefInputIterator to
- * dereference a device array of doubles through texture cache.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/tex_ref_input_iterator.cuh>
- *
- * // Declare, allocate, and initialize a device array
- * int num_items;   // e.g., 7
- * double *d_in;    // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0]
- *
- * // Create an iterator wrapper
- * cub::TexRefInputIterator<double, __LINE__> itr;
- * itr.BindTexture(d_in, sizeof(double) * num_items);
- * ...
- *
- * // Within device code:
- * printf("%f\n", itr[0]);      // 8.0
- * printf("%f\n", itr[1]);      // 6.0
- * printf("%f\n", itr[6]);      // 9.0
- *
- * ...
- * itr.UnbindTexture();
- *
- * \endcode
- *
- * \tparam T                    The value type of this iterator
- * \tparam UNIQUE_ID            A globally-unique identifier (within the compilation unit) to name the underlying texture reference
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- */
-template <
-    typename    T,
-    int         UNIQUE_ID,
-    typename    Offset = ptrdiff_t>
-class TexRefInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TexRefInputIterator                 self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::device_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    T*              ptr;
-    difference_type tex_offset;
-
-    // Texture reference wrapper (old Tesla/Fermi-style textures)
-    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TexRefInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0)
-    {}
-
-    /// Use this iterator to bind \p ptr with a texture reference
-    cudaError_t BindTexture(
-        T               *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes,                  ///< Number of bytes in the range
-        size_t          tex_offset = 0)         ///< Offset (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = ptr;
-        this->tex_offset = (difference_type) tex_offset;
-        return TexId::BindTexture(ptr);
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return TexId::UnbindTexture();
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-#if (CUB_PTX_ARCH == 0)
-        // Simply dereference the pointer on the host
-        return ptr[tex_offset];
-#else
-        // Use the texture reference
-        return TexId::Fetch(tex_offset);
-#endif
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return *(*this + n);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
-
-#endif // CUDA_VERSION
diff --git a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh b/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
deleted file mode 100644
index 90ffbaad2..000000000
--- a/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh
+++ /dev/null
@@ -1,252 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Random-access iterator types
- */
-
-#pragma once
-
-#include <iterator>
-#include <iostream>
-
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_namespace.cuh"
-
-#if (THRUST_VERSION >= 100700)
-    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
-
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIterator
- * @{
- */
-
-
-/**
- * \brief A random-access input wrapper for transforming dereferenced values.
- *
- * \par Overview
- * - TransformInputIterator wraps a unary conversion functor of type \p
- *   ConversionOp and a random-access input iterator of type <tt>InputIterator</tt>,
- *   using the former to produce references of type \p ValueType from the latter.
- * - Can be used with any data type.
- * - Can be constructed, manipulated, and exchanged within and between host and device
- *   functions.  Wrapped host memory can only be dereferenced on the host, and wrapped
- *   device memory can only be dereferenced on the device.
- * - Compatible with Thrust API v1.7 or newer.
- *
- * \par Snippet
- * The code snippet below illustrates the use of \p TransformInputIterator to
- * dereference an array of integers, tripling the values and converting them to doubles.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/iterator/transform_input_iterator.cuh>
- *
- * // Functor for tripling integer values and converting to doubles
- * struct TripleDoubler
- * {
- *     __host__ __device__ __forceinline__
- *     double operator()(const int &a) const {
- *         return double(a * 2);
- *     }
- * };
- *
- * // Declare, allocate, and initialize a device array
- * int *d_in;                   // e.g., [8, 6, 7, 5, 3, 0, 9]
- * TripleDoubler conversion_op;
- *
- * // Create an iterator wrapper
- * cub::TransformInputIterator<double, TripleDoubler, int*> itr(d_in, conversion_op);
- *
- * // Within device code:
- * printf("%f\n", itr[0]);  // 24.0
- * printf("%f\n", itr[1]);  // 18.0
- * printf("%f\n", itr[6]);  // 27.0
- *
- * \endcode
- *
- * \tparam ValueType            The value type of this iterator
- * \tparam ConversionOp         Unary functor type for mapping objects of type \p InputType to type \p ValueType.  Must have member <tt>ValueType operator()(const InputType &datum)</tt>.
- * \tparam InputIterator        The type of the wrapped input iterator
- * \tparam Offset               The difference type of this iterator (Default: \p ptrdiff_t)
- *
- */
-template <
-    typename ValueType,
-    typename ConversionOp,
-    typename InputIterator,
-    typename Offset = ptrdiff_t>
-class TransformInputIterator
-{
-public:
-
-    // Required iterator traits
-    typedef TransformInputIterator              self_type;              ///< My own type
-    typedef Offset                              difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef ValueType                           value_type;             ///< The type of the element the iterator can point to
-    typedef ValueType*                          pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef ValueType                           reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename thrust::detail::iterator_facade_category<
-        thrust::any_system_tag,
-        thrust::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    ConversionOp  conversion_op;
-    InputIterator input_itr;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ TransformInputIterator(
-        InputIterator       input_itr,          ///< Input iterator to wrap
-        ConversionOp        conversion_op)      ///< Conversion functor to wrap
-    :
-        conversion_op(conversion_op),
-        input_itr(input_itr)
-    {}
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        input_itr++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        input_itr++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        return conversion_op(*input_itr);
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval(input_itr + n, conversion_op);
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        input_itr += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval(input_itr - n, conversion_op);
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        input_itr -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return input_itr - other.input_itr;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        return conversion_op(input_itr[n]);
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &conversion_op(*input_itr);
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return (input_itr == rhs.input_itr);
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return (input_itr != rhs.input_itr);
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
-    {
-        return os;
-    }
-};
-
-
-
-/** @} */       // end group UtilIterator
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_load.cuh b/thrust/system/cuda/detail/cub/thread/thread_load.cuh
deleted file mode 100644
index 8e3790f53..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_load.cuh
+++ /dev/null
@@ -1,444 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for reading memory using PTX cache modifiers.
- */
-
-#pragma once
-
-#include <cuda.h>
-
-#include <iterator>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory load operations.
- */
-enum CacheLoadModifier
-{
-    LOAD_DEFAULT,       ///< Default (no modifier)
-    LOAD_CA,            ///< Cache at all levels
-    LOAD_CG,            ///< Cache at global level
-    LOAD_CS,            ///< Cache streaming (likely to be accessed once)
-    LOAD_CV,            ///< Cache as volatile (including cached system lines)
-    LOAD_LDG,           ///< Cache as texture
-    LOAD_VOLATILE,      ///< Volatile (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers.  Can be used to load any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_load.cuh>
- *
- * // 32-bit load using cache-global modifier:
- * int *d_in;
- * int val = cub::ThreadLoad<cub::LOAD_CA>(d_in + threadIdx.x);
- *
- * // 16-bit load using default modifier
- * short *d_in;
- * short val = cub::ThreadLoad<cub::LOAD_DEFAULT>(d_in + threadIdx.x);
- *
- * // 256-bit load using cache-volatile modifier
- * double4 *d_in;
- * double4 val = cub::ThreadLoad<cub::LOAD_CV>(d_in + threadIdx.x);
- *
- * // 96-bit load using cache-streaming modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val = cub::ThreadLoad<cub::LOAD_CS>(d_in + threadIdx.x);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheLoadModifier enumeration
- * \tparam InputIterator        <b>[inferred]</b> Input iterator type \iterator
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIterator>
-__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad(InputIterator itr);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated load iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadLoad
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T *ptr, T *vals)
-    {
-        vals[COUNT] = ThreadLoad<MODIFIER>(ptr + COUNT);
-        IterateThreadLoad<COUNT + 1, MAX>::template Load<MODIFIER>(ptr, vals);
-    }
-
-    template <typename InputIterator, typename T>
-    static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals)
-    {
-        vals[COUNT] = ptr[COUNT];
-        IterateThreadLoad<COUNT + 1, MAX>::Dereference(ptr, vals);
-    }
-};
-
-
-/// Helper structure for templated load iteration (termination case)
-template <int MAX>
-struct IterateThreadLoad<MAX, MAX>
-{
-    template <CacheLoadModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Load(T *ptr, T *vals) {}
-
-    template <typename InputIterator, typename T>
-    static __device__ __forceinline__ void Dereference(InputIterator ptr, T *vals) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier
- */
-#define CUB_LOAD_16(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ uint4 ThreadLoad<cub_modifier, uint4*>(uint4* ptr)           \
-    {                                                                                       \
-        uint4 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" :                 \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y),                                                                 \
-            "=r"(retval.z),                                                                 \
-            "=r"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ ulonglong2 ThreadLoad<cub_modifier, ulonglong2*>(ulonglong2* ptr)              \
-    {                                                                                       \
-        ulonglong2 retval;                                                                  \
-        asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" :                         \
-            "=l"(retval.x),                                                                 \
-            "=l"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier
- */
-#define CUB_LOAD_8(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ ushort4 ThreadLoad<cub_modifier, ushort4*>(ushort4* ptr)     \
-    {                                                                                       \
-        ushort4 retval;                                                                     \
-        asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" :                 \
-            "=h"(retval.x),                                                                 \
-            "=h"(retval.y),                                                                 \
-            "=h"(retval.z),                                                                 \
-            "=h"(retval.w) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ uint2 ThreadLoad<cub_modifier, uint2*>(uint2* ptr)           \
-    {                                                                                       \
-        uint2 retval;                                                                       \
-        asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" :                         \
-            "=r"(retval.x),                                                                 \
-            "=r"(retval.y) :                                                                \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned long long ThreadLoad<cub_modifier, unsigned long long*>(unsigned long long* ptr)                 \
-    {                                                                                       \
-        unsigned long long retval;                                                          \
-        asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" :                                  \
-            "=l"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-/**
- * Define a uint (4B) ThreadLoad specialization for the given Cache load modifier
- */
-#define CUB_LOAD_4(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned int ThreadLoad<cub_modifier, unsigned int*>(unsigned int* ptr)                 \
-    {                                                                                       \
-        unsigned int retval;                                                                \
-        asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" :                                  \
-            "=r"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier
- */
-#define CUB_LOAD_2(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned short ThreadLoad<cub_modifier, unsigned short*>(unsigned short* ptr)           \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" :                                  \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return retval;                                                                      \
-    }
-
-
-/**
- * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier
- */
-#define CUB_LOAD_1(cub_modifier, ptx_modifier)                                              \
-    template<>                                                                              \
-    __device__ __forceinline__ unsigned char ThreadLoad<cub_modifier, unsigned char*>(unsigned char* ptr)              \
-    {                                                                                       \
-        unsigned short retval;                                                              \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "    ld."#ptx_modifier".u8 datum, [%1];"                                            \
-        "    cvt.u16.u8 %0, datum;"                                                         \
-        "}" :                                                                               \
-            "=h"(retval) :                                                                  \
-            _CUB_ASM_PTR_(ptr));                                                            \
-        return (unsigned char) retval;                                                               \
-    }
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the given Cache load modifier
- */
-#define CUB_LOAD_ALL(cub_modifier, ptx_modifier)                                            \
-    CUB_LOAD_16(cub_modifier, ptx_modifier)                                                 \
-    CUB_LOAD_8(cub_modifier, ptx_modifier)                                                  \
-    CUB_LOAD_4(cub_modifier, ptx_modifier)                                                  \
-    CUB_LOAD_2(cub_modifier, ptx_modifier)                                                  \
-    CUB_LOAD_1(cub_modifier, ptx_modifier)                                                  \
-
-
-/**
- * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    CUB_LOAD_ALL(LOAD_CA, ca)
-    CUB_LOAD_ALL(LOAD_CG, cg)
-    CUB_LOAD_ALL(LOAD_CS, cs)
-    CUB_LOAD_ALL(LOAD_CV, cv)
-#else
-    CUB_LOAD_ALL(LOAD_CA, global)
-    // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1
-    CUB_LOAD_ALL(LOAD_CG, volatile.global)
-    CUB_LOAD_ALL(LOAD_CS, global)
-    CUB_LOAD_ALL(LOAD_CV, volatile.global)
-#endif
-
-#if CUB_PTX_ARCH >= 350
-    CUB_LOAD_ALL(LOAD_LDG, global.nc)
-#else
-    CUB_LOAD_ALL(LOAD_LDG, global)
-#endif
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types
- */
-template <typename InputIterator>
-__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad(
-    InputIterator           itr,
-    Int2Type<LOAD_DEFAULT>  modifier,
-    Int2Type<false>         is_pointer)
-{
-    return *itr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_DEFAULT>  modifier,
-    Int2Type<true>          is_pointer)
-{
-    return *ptr;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<true>          is_primitive)
-{
-    T retval = *reinterpret_cast<volatile T*>(ptr);
-
-#if (CUB_PTX_ARCH <= 130)
-    if (sizeof(T) == 1) __threadfence_block();
-#endif
-
-    return retval;
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoadVolatilePointer(
-    T                       *ptr,
-    Int2Type<false>          is_primitive)
-{
-
-#if CUB_PTX_ARCH <= 130
-
-    T retval = *ptr;
-    __threadfence_block();
-    return retval;
-
-#else
-
-    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-/*
-    VolatileWord words[VOLATILE_MULTIPLE];
-
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-*/
-
-    T retval;
-    VolatileWord *words = reinterpret_cast<VolatileWord*>(&retval);
-    IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-    return retval;
-
-#endif  // CUB_PTX_ARCH <= 130
-}
-
-
-/**
- * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<LOAD_VOLATILE> modifier,
-    Int2Type<true>          is_pointer)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoadVolatilePointer(ptr, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ T ThreadLoad(
-    T                       *ptr,
-    Int2Type<MODIFIER>      modifier,
-    Int2Type<true>          is_pointer)
-{
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
-
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load<CacheLoadModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(ptr),
-        words);
-
-    return *reinterpret_cast<T*>(words);
-}
-
-
-/**
- * ThreadLoad definition for generic modifiers
- */
-template <
-    CacheLoadModifier MODIFIER,
-    typename InputIterator>
-__device__ __forceinline__ typename std::iterator_traits<InputIterator>::value_type ThreadLoad(InputIterator itr)
-{
-    // Apply tags for partial-specialization
-    return ThreadLoad(
-        itr,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<InputIterator>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh b/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
deleted file mode 100644
index d03ec0085..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_operators.cuh
+++ /dev/null
@@ -1,252 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Simple binary operator functor types
- */
-
-/******************************************************************************
- * Simple functor operators
- ******************************************************************************/
-
-#pragma once
-
-#include "../util_macro.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \brief Default equality functor
- */
-struct Equality
-{
-    /// Boolean equality operator, returns <tt>(a == b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a == b;
-    }
-};
-
-
-/**
- * \brief Default inequality functor
- */
-struct Inequality
-{
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return a != b;
-    }
-};
-
-
-/**
- * \brief Inequality functor (wraps equality functor)
- */
-template <typename EqualityOp>
-struct InequalityWrapper
-{
-    /// Wrapped equality operator
-    EqualityOp op;
-
-    /// Constructor
-    __host__ __device__ __forceinline__
-    InequalityWrapper(EqualityOp op) : op(op) {}
-
-    /// Boolean inequality operator, returns <tt>(a != b)</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const
-    {
-        return !op(a, b);
-    }
-};
-
-
-/**
- * \brief Default sum functor
- */
-struct Sum
-{
-    /// Boolean sum operator, returns <tt>a + b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return a + b;
-    }
-};
-
-
-/**
- * \brief Default max functor
- */
-struct Max
-{
-    /// Boolean max operator, returns <tt>(a > b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MAX(a, b);
-    }
-};
-
-
-/**
- * \brief Arg max functor (keeps the value and offset of the first occurrence of the l item)
- */
-struct ArgMax
-{
-    /// Boolean max operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename Offset>
-    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
-        const ItemOffsetPair<T, Offset> &a,
-        const ItemOffsetPair<T, Offset> &b) const
-    {
-        if (a.value == b.value)
-            return (b.offset < a.offset) ? b : a;
-
-        return (b.value > a.value) ? b : a;
-    }
-};
-
-
-/**
- * \brief Default min functor
- */
-struct Min
-{
-    /// Boolean min operator, returns <tt>(a < b) ? a : b</tt>
-    template <typename T>
-    __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const
-    {
-        return CUB_MIN(a, b);
-    }
-};
-
-
-/**
- * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item)
- */
-struct ArgMin
-{
-    /// Boolean min operator, preferring the item having the smaller offset in case of ties
-    template <typename T, typename Offset>
-    __host__ __device__ __forceinline__ ItemOffsetPair<T, Offset> operator()(
-        const ItemOffsetPair<T, Offset> &a,
-        const ItemOffsetPair<T, Offset> &b) const
-    {
-        if (a.value == b.value)
-            return (b.offset < a.offset) ? b : a;
-
-        return (b.value < a.value) ? b : a;
-    }
-};
-
-
-/**
- * \brief Default cast functor
- */
-template <typename B>
-struct Cast
-{
-    /// Cast operator, returns <tt>(B) a</tt>
-    template <typename A>
-    __host__ __device__ __forceinline__ B operator()(const A &a) const
-    {
-        return (B) a;
-    }
-};
-
-
-/**
- * \brief Reduce-by-segment functor.
- *
- * Given two cub::ItemOffsetPair inputs \p a and \p b and a
- * binary associative combining operator \p <tt>f(const T &x, const T &y)</tt>,
- * an instance of this functor returns a cub::ItemOffsetPair whose \p offset
- * field is <tt>a.offset</tt> + <tt>a.offset</tt>, and whose \p value field
- * is either b.value if b.offset is non-zero, or f(a.value, b.value) otherwise.
- *
- * ReduceBySegmentOp is an associative, non-commutative binary combining operator
- * for input sequences of cub::ItemOffsetPair pairings.  Such
- * sequences are typically used to represent a segmented set of values to be reduced
- * and a corresponding set of {0,1}-valued integer "head flags" demarcating the
- * first value of each segment.
- *
- */
-template <
-    typename ReductionOp,                           ///< Binary reduction operator to apply to values
-    typename ItemOffsetPair>                        ///< ItemOffsetPair pairing of T (value) and Offset (head flag)
-class ReduceBySegmentOp
-{
-private:
-
-    /// Wrapped reduction operator
-    ReductionOp op;
-
-public:
-
-    /// Constructor
-    __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOp op) : op(op) {}
-
-    /// Scan operator
-    __host__ __device__ __forceinline__ ItemOffsetPair operator()(
-        const ItemOffsetPair &first,       ///< First partial reduction
-        const ItemOffsetPair &second)      ///< Second partial reduction
-    {
-        // This expression uses less registers and is faster when compiled with Open64
-        ItemOffsetPair retval;
-        retval.offset = first.offset + second.offset;
-        retval.value = (second.offset) ?
-                second.value :                          // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate
-                op(first.value, second.value);          // The second partial reduction does not span a reset, so accumulate both into the running aggregate
-        return retval;
-    }
-};
-
-
-/** @} */       // end group UtilModule
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh b/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
deleted file mode 100644
index 29bc8ce0c..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh
+++ /dev/null
@@ -1,169 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential reduction over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \name Sequential reduction over statically-sized array types
- * @{
- */
-
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*                  input,                  ///< [in] Input array
-    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
-    T                   prefix,                 ///< [in] Prefix to seed reduction with
-    Int2Type<LENGTH>    length)
-{
-    T addend = *input;
-    prefix = reduction_op(prefix, addend);
-
-    return ThreadReduce(input + 1, reduction_op, prefix, Int2Type<LENGTH - 1>());
-}
-
-template <
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*                  input,                  ///< [in] Input array
-    ReductionOp         reduction_op,           ///< [in] Binary reduction operator
-    T                   prefix,                 ///< [in] Prefix to seed reduction with
-    Int2Type<0>         length)
-{
-    return prefix;
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     Length of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce(input, reduction_op, prefix, Int2Type<LENGTH>());
-}
-
-
-/**
- * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     Length of input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T*          input,                  ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    T prefix = input[0];
-    return ThreadReduce<LENGTH - 1>(input + 1, reduction_op, prefix);
-}
-
-
-/**
- * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> Length of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op,           ///< [in] Binary reduction operator
-    T           prefix)                 ///< [in] Prefix to seed reduction with
-{
-    return ThreadReduce<LENGTH>(input, reduction_op, prefix);
-}
-
-
-/**
- * \brief Serial reduction with the specified operator
- *
- * \tparam LENGTH     <b>[inferred]</b> Length of \p input array
- * \tparam T          <b>[inferred]</b> The data type to be reduced.
- * \tparam ScanOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ReductionOp>
-__device__ __forceinline__ T ThreadReduce(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    ReductionOp reduction_op)           ///< [in] Binary reduction operator
-{
-    return ThreadReduce<LENGTH>((T*) input, reduction_op);
-}
-
-
-//@}  end member group
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh b/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
deleted file mode 100644
index 6276bf83b..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_scan.cuh
+++ /dev/null
@@ -1,283 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for sequential prefix scan over statically-sized array types
- */
-
-#pragma once
-
-#include "../thread/thread_operators.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * \name Sequential prefix scan over statically-sized array types
- * @{
- */
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T                   inclusive,
-    T                   exclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    length)
-{
-    T addend = *input;
-    inclusive = scan_op(exclusive, addend);
-    *output = exclusive;
-    exclusive = inclusive;
-
-    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-template <
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T                   inclusive,
-    T                   exclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<0>         length)
-{
-    return inclusive;
-}
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  If not, the first output element is undefined.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = prefix;
-    T exclusive = inclusive;
-
-    return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanExclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanExclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-
-
-
-
-
-
-
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T                   inclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<LENGTH>    length)
-{
-    T addend = *input;
-    inclusive = scan_op(inclusive, addend);
-    output[0] = inclusive;
-
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-template <
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T                   inclusive,
-    T                   *input,                 ///< [in] Input array
-    T                   *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp              scan_op,                ///< [in] Binary scan operator
-    Int2Type<0>         length)
-{
-    return inclusive;
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    T inclusive = input[0];
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op)                ///< [in] Binary scan operator
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op);
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           *input,                 ///< [in] Input array
-    T           *output,                ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    T inclusive = input[0];
-    if (apply_prefix)
-    {
-        inclusive = scan_op(prefix, inclusive);
-    }
-    output[0] = inclusive;
-
-    // Continue scan
-    return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type<LENGTH - 1>());
-}
-
-
-/**
- * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix.  The aggregate is returned.
- *
- * \tparam LENGTH     <b>[inferred]</b> Length of \p input and \p output arrays
- * \tparam T          <b>[inferred]</b> The data type to be scanned.
- * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
- */
-template <
-    int         LENGTH,
-    typename    T,
-    typename    ScanOp>
-__device__ __forceinline__ T ThreadScanInclusive(
-    T           (&input)[LENGTH],       ///< [in] Input array
-    T           (&output)[LENGTH],      ///< [out] Output array (may be aliased to \p input)
-    ScanOp      scan_op,                ///< [in] Binary scan operator
-    T           prefix,                 ///< [in] Prefix to seed scan with
-    bool        apply_prefix = true)    ///< [in] Whether or not the calling thread should apply its prefix.  (Handy for preventing thread-0 from applying a prefix.)
-{
-    return ThreadScanInclusive<LENGTH>((T*) input, (T*) output, scan_op, prefix, apply_prefix);
-}
-
-
-//@}  end member group
-
-/** @} */       // end group UtilModule
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/thread/thread_store.cuh b/thrust/system/cuda/detail/cub/thread/thread_store.cuh
deleted file mode 100644
index 6d036d42e..000000000
--- a/thrust/system/cuda/detail/cub/thread/thread_store.cuh
+++ /dev/null
@@ -1,414 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Thread utilities for writing memory using PTX cache modifiers.
- */
-
-#pragma once
-
-#include <cuda.h>
-
-#include "../util_ptx.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup UtilIo
- * @{
- */
-
-
-//-----------------------------------------------------------------------------
-// Tags and constants
-//-----------------------------------------------------------------------------
-
-/**
- * \brief Enumeration of cache modifiers for memory store operations.
- */
-enum CacheStoreModifier
-{
-    STORE_DEFAULT,              ///< Default (no modifier)
-    STORE_WB,                   ///< Cache write-back all coherent levels
-    STORE_CG,                   ///< Cache at global level
-    STORE_CS,                   ///< Cache streaming (likely to be accessed once)
-    STORE_WT,                   ///< Cache write-through (to system memory)
-    STORE_VOLATILE,             ///< Volatile shared (any memory space)
-};
-
-
-/**
- * \name Thread I/O (cache modified)
- * @{
- */
-
-/**
- * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers.  Can be used to store any data type.
- *
- * \par Example
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/thread/thread_store.cuh>
- *
- * // 32-bit store using cache-global modifier:
- * int *d_out;
- * int val;
- * cub::ThreadStore<cub::STORE_CG>(d_out + threadIdx.x, val);
- *
- * // 16-bit store using default modifier
- * short *d_out;
- * short val;
- * cub::ThreadStore<cub::STORE_DEFAULT>(d_out + threadIdx.x, val);
- *
- * // 256-bit store using write-through modifier
- * double4 *d_out;
- * double4 val;
- * cub::ThreadStore<cub::STORE_WT>(d_out + threadIdx.x, val);
- *
- * // 96-bit store using cache-streaming cache modifier
- * struct TestFoo { bool a; short b; };
- * TestFoo *d_struct;
- * TestFoo val;
- * cub::ThreadStore<cub::STORE_CS>(d_out + threadIdx.x, val);
- * \endcode
- *
- * \tparam MODIFIER             <b>[inferred]</b> CacheStoreModifier enumeration
- * \tparam InputIterator        <b>[inferred]</b> Output iterator type \iterator
- * \tparam T                    <b>[inferred]</b> Data type of output value
- */
-template <
-    CacheStoreModifier  MODIFIER,
-    typename            OutputIterator,
-    typename            T>
-__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val);
-
-
-//@}  end member group
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/// Helper structure for templated store iteration (inductive case)
-template <int COUNT, int MAX>
-struct IterateThreadStore
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T *ptr, T *vals)
-    {
-        ThreadStore<MODIFIER>(ptr + COUNT, vals[COUNT]);
-        IterateThreadStore<COUNT + 1, MAX>::template Store<MODIFIER>(ptr, vals);
-    }
-
-    template <typename OutputIterator, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals)
-    {
-        ptr[COUNT] = vals[COUNT];
-        IterateThreadStore<COUNT + 1, MAX>::Dereference(ptr, vals);
-    }
-
-};
-
-/// Helper structure for templated store iteration (termination case)
-template <int MAX>
-struct IterateThreadStore<MAX, MAX>
-{
-    template <CacheStoreModifier MODIFIER, typename T>
-    static __device__ __forceinline__ void Store(T *ptr, T *vals) {}
-
-    template <typename OutputIterator, typename T>
-    static __device__ __forceinline__ void Dereference(OutputIterator ptr, T *vals) {}
-};
-
-
-/**
- * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier
- */
-#define CUB_STORE_16(cub_modifier, ptx_modifier)                                            \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint4*, uint4>(uint4* ptr, uint4 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y),                                                                     \
-            "r"(val.z),                                                                     \
-            "r"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ulonglong2*, ulonglong2>(ulonglong2* ptr, ulonglong2 val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val.x),                                                                     \
-            "l"(val.y));                                                                    \
-    }
-
-
-/**
- * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier
- */
-#define CUB_STORE_8(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, ushort4*, ushort4>(ushort4* ptr, ushort4 val)                 \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : :               \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val.x),                                                                     \
-            "h"(val.y),                                                                     \
-            "h"(val.z),                                                                     \
-            "h"(val.w));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, uint2*, uint2>(uint2* ptr, uint2 val)                         \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : :                       \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val.x),                                                                     \
-            "r"(val.y));                                                                    \
-    }                                                                                       \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned long long*, unsigned long long>(unsigned long long* ptr, unsigned long long val)     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "l"(val));                                                                      \
-    }
-
-/**
- * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier
- */
-#define CUB_STORE_4(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned int*, unsigned int>(unsigned int* ptr, unsigned int val)                             \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "r"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier
- */
-#define CUB_STORE_2(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned short*, unsigned short>(unsigned short* ptr, unsigned short val)                     \
-    {                                                                                       \
-        asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : :                                \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"(val));                                                                      \
-    }
-
-
-/**
- * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier
- */
-#define CUB_STORE_1(cub_modifier, ptx_modifier)                                             \
-    template<>                                                                              \
-    __device__ __forceinline__ void ThreadStore<cub_modifier, unsigned char*, unsigned char>(unsigned char* ptr, unsigned char val)                         \
-    {                                                                                       \
-        asm volatile (                                                                      \
-        "{"                                                                                 \
-        "   .reg .u8 datum;"                                                                \
-        "   cvt.u8.u16 datum, %1;"                                                          \
-        "   st."#ptx_modifier".u8 [%0], datum;"                                             \
-        "}" : :                                                                             \
-            _CUB_ASM_PTR_(ptr),                                                             \
-            "h"((unsigned short) val));                                                               \
-    }
-
-/**
- * Define powers-of-two ThreadStore specializations for the given Cache load modifier
- */
-#define CUB_STORE_ALL(cub_modifier, ptx_modifier)                                           \
-    CUB_STORE_16(cub_modifier, ptx_modifier)                                                \
-    CUB_STORE_8(cub_modifier, ptx_modifier)                                                 \
-    CUB_STORE_4(cub_modifier, ptx_modifier)                                                 \
-    CUB_STORE_2(cub_modifier, ptx_modifier)                                                 \
-    CUB_STORE_1(cub_modifier, ptx_modifier)                                                 \
-
-
-/**
- * Define ThreadStore specializations for the various Cache load modifiers
- */
-#if CUB_PTX_ARCH >= 200
-    CUB_STORE_ALL(STORE_WB, ca)
-    CUB_STORE_ALL(STORE_CG, cg)
-    CUB_STORE_ALL(STORE_CS, cs)
-    CUB_STORE_ALL(STORE_WT, wt)
-#else
-    CUB_STORE_ALL(STORE_WB, global)
-    CUB_STORE_ALL(STORE_CG, global)
-    CUB_STORE_ALL(STORE_CS, global)
-    CUB_STORE_ALL(STORE_WT, volatile.global)
-#endif
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on iterator types
- */
-template <typename OutputIterator, typename T>
-__device__ __forceinline__ void ThreadStore(
-    OutputIterator              itr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     modifier,
-    Int2Type<false>             is_pointer)
-{
-    *itr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_DEFAULT modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_DEFAULT>     modifier,
-    Int2Type<true>              is_pointer)
-{
-    *ptr = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<true>              is_primitive)
-{
-    *reinterpret_cast<volatile T*>(ptr) = val;
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStoreVolatilePtr(
-    T                           *ptr,
-    T                           val,
-    Int2Type<false>             is_primitive)
-{
-#if CUB_PTX_ARCH <= 130
-
-    *ptr = val;
-    __threadfence_block();
-
-#else
-
-    typedef typename UnitWord<T>::VolatileWord VolatileWord;   // Word type for memcopying
-
-    const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord);
-
-    VolatileWord words[VOLATILE_MULTIPLE];
-    *reinterpret_cast<T*>(words) = val;
-
-//    VolatileWord *words = reinterpret_cast<VolatileWord*>(&val);
-
-    IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference(
-        reinterpret_cast<volatile VolatileWord*>(ptr),
-        words);
-
-#endif  // CUB_PTX_ARCH <= 130
-
-}
-
-
-/**
- * ThreadStore definition for STORE_VOLATILE modifier on pointer types
- */
-template <typename T>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<STORE_VOLATILE>    modifier,
-    Int2Type<true>              is_pointer)
-{
-    ThreadStoreVolatilePtr(ptr, val, Int2Type<Traits<T>::PRIMITIVE>());
-}
-
-
-/**
- * ThreadStore definition for generic modifiers on pointer types
- */
-template <typename T, int MODIFIER>
-__device__ __forceinline__ void ThreadStore(
-    T                           *ptr,
-    T                           val,
-    Int2Type<MODIFIER>          modifier,
-    Int2Type<true>              is_pointer)
-{
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;   // Word type for memcopying
-
-    const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord);
-
-    DeviceWord words[DEVICE_MULTIPLE];
-
-    *reinterpret_cast<T*>(words) = val;
-
-    IterateThreadStore<0, DEVICE_MULTIPLE>::template Store<CacheStoreModifier(MODIFIER)>(
-        reinterpret_cast<DeviceWord*>(ptr),
-        words);
-}
-
-
-/**
- * ThreadStore definition for generic modifiers
- */
-template <CacheStoreModifier MODIFIER, typename OutputIterator, typename T>
-__device__ __forceinline__ void ThreadStore(OutputIterator itr, T val)
-{
-    ThreadStore(
-        itr,
-        val,
-        Int2Type<MODIFIER>(),
-        Int2Type<IsPointer<OutputIterator>::VALUE>());
-}
-
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/** @} */       // end group UtilIo
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_allocator.cuh b/thrust/system/cuda/detail/cub/util_allocator.cuh
deleted file mode 100644
index 3d0e8b745..000000000
--- a/thrust/system/cuda/detail/cub/util_allocator.cuh
+++ /dev/null
@@ -1,702 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Simple caching allocator for device memory allocations. The allocator is
- * thread-safe and capable of managing device allocations on multiple devices.
- ******************************************************************************/
-
-#pragma once
-
-#if (CUB_PTX_ARCH == 0)
-    #include <set>              // NVCC (EDG, really) takes FOREVER to compile std::map
-    #include <map>
-#endif
-
-#include <math.h>
-
-#include "util_namespace.cuh"
-#include "util_debug.cuh"
-
-#include "host/spinlock.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/******************************************************************************
- * CachingDeviceAllocator (host use)
- ******************************************************************************/
-
-/**
- * \brief A simple caching allocator for device memory allocations.
- *
- * \par Overview
- * The allocator is thread-safe and stream-safe and is capable of managing cached
- * device allocations on multiple devices.  It behaves as follows:
- *
- * \par
- * - Allocations from the allocator are associated with an \p active_stream.  Once freed,
- *   the allocation becomes available immediately for reuse within the \p active_stream
- *   with which it was associated with during allocation, and it becomes available for
- *   reuse within other streams when all prior work submitted to \p active_stream has completed.
- * - Allocations are categorized and cached by bin size.  A new allocation request of
- *   a given size will only consider cached allocations within the corresponding bin.
- * - Bin limits progress geometrically in accordance with the growth factor
- *   \p bin_growth provided during construction.  Unused device allocations within
- *   a larger bin cache are not reused for allocation requests that categorize to
- *   smaller bin sizes.
- * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to
- *   (\p bin_growth ^ \p min_bin).
- * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest
- *   bin and are simply freed when they are deallocated instead of being returned
- *   to a bin-cache.
- * - %If the total storage of cached allocations on a given device will exceed
- *   \p max_cached_bytes, allocations for that device are simply freed when they are
- *   deallocated instead of being returned to their bin-cache.
- *
- * \par
- * For example, the default-constructed CachingDeviceAllocator is configured with:
- * - \p bin_growth = 8
- * - \p min_bin = 3
- * - \p max_bin = 7
- * - \p max_cached_bytes = 6MB - 1B
- *
- * \par
- * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB
- * and sets a maximum of 6,291,455 cached bytes per device
- *
- */
-struct CachingDeviceAllocator
-{
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-    //---------------------------------------------------------------------
-    // Type definitions and constants
-    //---------------------------------------------------------------------
-
-    enum
-    {
-        /// Invalid device ordinal
-        INVALID_DEVICE_ORDINAL  = -1,
-    };
-
-    /**
-     * Integer pow function for unsigned base and exponent
-     */
-    static unsigned int IntPow(
-        unsigned int base,
-        unsigned int exp)
-    {
-        unsigned int retval = 1;
-        while (exp > 0)
-        {
-            if (exp & 1) {
-                retval = retval * base;        // multiply the result by the current base
-            }
-            base = base * base;                // square the base
-            exp = exp >> 1;                    // divide the exponent in half
-        }
-        return retval;
-    }
-
-
-    /**
-     * Round up to the nearest power-of
-     */
-    static void NearestPowerOf(
-        unsigned int &power,
-        size_t &rounded_bytes,
-        unsigned int base,
-        size_t value)
-    {
-        power = 0;
-        rounded_bytes = 1;
-
-        while (rounded_bytes < value)
-        {
-            rounded_bytes *= base;
-            power++;
-        }
-    }
-
-    /**
-     * Descriptor for device memory allocations
-     */
-    struct BlockDescriptor
-    {
-        int             device;             // device ordinal
-        void*           d_ptr;              // Device pointer
-        cudaStream_t    associated_stream;  // Associated associated_stream
-        cudaEvent_t     ready_event;        // Signal when associated stream has run to the point at which this block was freed
-        size_t          bytes;              // Size of allocation in bytes
-        unsigned int    bin;                // Bin enumeration
-
-        // Constructor
-        BlockDescriptor(void *d_ptr, int device) :
-            d_ptr(d_ptr),
-            bytes(0),
-            bin(0),
-            device(device),
-            associated_stream(0),
-            ready_event(0)
-        {}
-
-        // Constructor
-        BlockDescriptor(size_t bytes, unsigned int bin, int device, cudaStream_t associated_stream) :
-            d_ptr(NULL),
-            bytes(bytes),
-            bin(bin),
-            device(device),
-            associated_stream(associated_stream),
-            ready_event(0)
-        {}
-
-        // Comparison functor for comparing device pointers
-        static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.d_ptr < b.d_ptr);
-            else
-                return (a.device < b.device);
-        }
-
-        // Comparison functor for comparing allocation sizes
-        static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b)
-        {
-            if (a.device == b.device)
-                return (a.bytes < b.bytes);
-            else
-                return (a.device < b.device);
-        }
-    };
-
-    /// BlockDescriptor comparator function interface
-    typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &);
-
-#if (CUB_PTX_ARCH == 0)   // Only define STL container members in host code
-
-    /// Set type for cached blocks (ordered by size)
-    typedef std::multiset<BlockDescriptor, Compare> CachedBlocks;
-
-    /// Set type for live blocks (ordered by ptr)
-    typedef std::multiset<BlockDescriptor, Compare> BusyBlocks;
-
-    /// Map type of device ordinals to the number of cached bytes cached by each device
-    typedef std::map<int, size_t> GpuCachedBytes;
-
-#endif // CUB_PTX_ARCH
-
-    //---------------------------------------------------------------------
-    // Fields
-    //---------------------------------------------------------------------
-
-    Spinlock        spin_lock;          /// Spinlock for thread-safety
-
-    unsigned int    bin_growth;         /// Geometric growth factor for bin-sizes
-    unsigned int    min_bin;            /// Minimum bin enumeration
-    unsigned int    max_bin;            /// Maximum bin enumeration
-
-    size_t          min_bin_bytes;      /// Minimum bin size
-    size_t          max_bin_bytes;      /// Maximum bin size
-    size_t          max_cached_bytes;   /// Maximum aggregate cached bytes per device
-
-    bool            debug;              /// Whether or not to print (de)allocation events to stdout
-    bool            skip_cleanup;       /// Whether or not to skip a call to FreeAllCached() when destructor is called.  (The CUDA runtime may have already shut down for statically declared allocators)
-
-#if (CUB_PTX_ARCH == 0)   // Only define STL container members in host code
-
-    GpuCachedBytes  cached_bytes;       /// Map of device ordinal to aggregate cached bytes on that device
-    CachedBlocks    cached_blocks;      /// Set of cached device allocations available for reuse
-    BusyBlocks      live_blocks;        /// Set of live device allocations currently in use
-
-#endif // CUB_PTX_ARCH
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-    //---------------------------------------------------------------------
-    // Methods
-    //---------------------------------------------------------------------
-
-    /**
-     * \brief Constructor.
-     */
-    CachingDeviceAllocator(
-        unsigned int    bin_growth,             ///< Geometric growth factor for bin-sizes
-        unsigned int    min_bin,                ///< Minimum bin
-        unsigned int    max_bin,                ///< Maximum bin
-        size_t          max_cached_bytes,       ///< Maximum aggregate cached bytes per device
-        bool            skip_cleanup = false)   ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called.  (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
-    :
-    #if (CUB_PTX_ARCH == 0)   // Only define STL container members in host code
-            cached_blocks(BlockDescriptor::SizeCompare),
-            live_blocks(BlockDescriptor::PtrCompare),
-    #endif
-            debug(false),
-            spin_lock(0),
-            bin_growth(bin_growth),
-            min_bin(min_bin),
-            max_bin(max_bin),
-            min_bin_bytes(IntPow(bin_growth, min_bin)),
-            max_bin_bytes(IntPow(bin_growth, max_bin)),
-            max_cached_bytes(max_cached_bytes)
-    {}
-
-
-    /**
-     * \brief Default constructor.
-     *
-     * Configured with:
-     * \par
-     * - \p bin_growth = 8
-     * - \p min_bin = 3
-     * - \p max_bin = 7
-     * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes
-     *
-     * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and
-     * sets a maximum of 6,291,455 cached bytes per device
-     */
-    CachingDeviceAllocator(
-        bool skip_cleanup = false)  ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called.  (Useful for preventing warnings when the allocator is declared at file/static/global scope: by the time the destructor is called on program exit, the CUDA runtime may have already shut down and freed all allocations.)
-    :
-    #if (CUB_PTX_ARCH == 0)   // Only define STL container members in host code
-        cached_blocks(BlockDescriptor::SizeCompare),
-        live_blocks(BlockDescriptor::PtrCompare),
-    #endif
-        skip_cleanup(skip_cleanup),
-        debug(false),
-        spin_lock(0),
-        bin_growth(8),
-        min_bin(3),
-        max_bin(7),
-        min_bin_bytes(IntPow(bin_growth, min_bin)),
-        max_bin_bytes(IntPow(bin_growth, max_bin)),
-        max_cached_bytes((max_bin_bytes * 3) - 1)
-    {}
-
-
-    /**
-     * \brief Sets the limit on the number bytes this allocator is allowed to cache per device.
-     */
-    cudaError_t SetMaxCachedBytes(
-        size_t max_cached_bytes)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-
-        // Lock
-        Lock(&spin_lock);
-
-        this->max_cached_bytes = max_cached_bytes;
-
-        if (debug) CubLog("New max_cached_bytes(%lld)\n", (long long) max_cached_bytes);
-
-        // Unlock
-        Unlock(&spin_lock);
-
-        return cudaSuccess;
-
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the specified device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        int             device,             ///< [in] Device on which to place the allocation
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-
-        *d_ptr                          = NULL;
-        bool locked                     = false;
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        do {
-
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
-            if (device == INVALID_DEVICE_ORDINAL)
-                device = entrypoint_device;
-
-            // Round up to nearest bin size
-            unsigned int bin;
-            size_t bin_bytes;
-            NearestPowerOf(bin, bin_bytes, bin_growth, bytes);
-            if (bin < min_bin) {
-                bin = min_bin;
-                bin_bytes = min_bin_bytes;
-            }
-
-            // Check if bin is greater than our maximum bin
-            if (bin > max_bin)
-            {
-                // Allocate the request exactly and give out-of-range bin
-                bin = (unsigned int) -1;
-                bin_bytes = bytes;
-            }
-
-            BlockDescriptor search_key(bin_bytes, bin, device, active_stream);
-
-            // Lock
-            if (!locked) {
-                Lock(&spin_lock);
-                locked = true;
-            }
-
-            // Find the range of freed blocks big enough within the same bin on the same device
-            CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key);
-
-            // Look for freed blocks from the active stream or from other idle streams
-            bool found = false;
-            while ((block_itr != cached_blocks.end()) &&
-                (block_itr->device == device) &&
-                (block_itr->bin == search_key.bin))
-            {
-                cudaStream_t prev_stream = block_itr->associated_stream;
-                if ((active_stream == prev_stream) || (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady))
-                {
-                    // Reuse existing cache block.  Insert into live blocks.
-                    found = true;
-                    search_key = *block_itr;
-                    search_key.associated_stream = active_stream;
-                    live_blocks.insert(search_key);
-
-                    // Remove from free blocks
-                    cached_blocks.erase(block_itr);
-                    cached_bytes[device] -= search_key.bytes;
-
-                    if (debug) CubLog("\tdevice %d reused cached block for stream %lld (%lld bytes, previously associated with stream %lld).\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
-                        device, (long long) active_stream, (long long) search_key.bytes, (long long) prev_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
-
-                    break;
-                }
-
-                block_itr++;
-            }
-
-            if (!found)
-            {
-                // Need to allocate a new cache block. Unlock.
-                if (locked) {
-                    Unlock(&spin_lock);
-                    locked = false;
-                }
-
-                // Set to specified device
-                if (device != entrypoint_device) {
-                    if (CubDebug(error = cudaSetDevice(device))) break;
-                }
-
-                // Allocate
-                if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) break;
-                if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) break;
-
-                // Lock
-                if (!locked) {
-                    Lock(&spin_lock);
-                    locked = true;
-                }
-
-                // Insert into live blocks
-                live_blocks.insert(search_key);
-
-                if (debug) CubLog("\tdevice %d allocating new device block %lld bytes associated with stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
-                    device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
-            }
-
-            // Copy device pointer to output parameter
-            *d_ptr = search_key.d_ptr;
-
-        } while(0);
-
-        // Unlock
-        if (locked) {
-            Unlock(&spin_lock);
-            locked = false;
-        }
-
-        // Attempt to revert back to previous device if necessary
-        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Provides a suitable allocation of device memory for the given size on the current device.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceAllocate(
-        void            **d_ptr,            ///< [out] Reference to pointer to the allocation
-        size_t          bytes,              ///< [in] Minimum number of bytes for the allocation
-        cudaStream_t    active_stream = 0)  ///< [in] The stream to be associated with this allocation
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-        return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream);
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        int             device,
-        void*           d_ptr)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-
-        bool locked                     = false;
-        int entrypoint_device           = INVALID_DEVICE_ORDINAL;
-        cudaError_t error               = cudaSuccess;
-
-        do {
-            if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
-            if (device == INVALID_DEVICE_ORDINAL)
-                device = entrypoint_device;
-
-            // Set to specified device
-            if (device != entrypoint_device) {
-                if (CubDebug(error = cudaSetDevice(device))) break;
-            }
-
-            // Lock
-            if (!locked) {
-                Lock(&spin_lock);
-                locked = true;
-            }
-
-            // Find corresponding block descriptor
-            BlockDescriptor search_key(d_ptr, device);
-            BusyBlocks::iterator block_itr = live_blocks.find(search_key);
-            if (block_itr == live_blocks.end())
-            {
-                // Cannot find pointer
-                if (CubDebug(error = cudaErrorUnknown)) break;
-            }
-            else
-            {
-                // Remove from live blocks
-                search_key = *block_itr;
-                live_blocks.erase(block_itr);
-
-                // Check if we should keep the returned allocation
-                if (cached_bytes[device] + search_key.bytes <= max_cached_bytes)
-                {
-                    // Signal the event in the associated stream
-                    if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) break;
-
-                    // Insert returned allocation into free blocks
-                    cached_blocks.insert(search_key);
-                    cached_bytes[device] += search_key.bytes;
-
-                    if (debug) CubLog("\tdevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
-                        device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
-                }
-                else
-                {
-                    // Free the returned allocation.  Unlock.
-                    if (locked) {
-                        Unlock(&spin_lock);
-                        locked = false;
-                    }
-
-                    // Free device memory
-                    if (CubDebug(error = cudaFree(d_ptr))) break;
-                    if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) break;
-
-                    if (debug) CubLog("\tdevice %d freed %lld bytes from associated stream %lld.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
-                        device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device], (long long) live_blocks.size());
-                }
-            }
-        } while (0);
-
-        // Unlock
-        if (locked) {
-            Unlock(&spin_lock);
-            locked = false;
-        }
-
-        if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device))
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Frees a live allocation of device memory on the current device, returning it to the allocator.
-     *
-     * Once freed, the allocation becomes available immediately for reuse within the \p active_stream
-     * with which it was associated with during allocation, and it becomes available for reuse within other
-     * streams when all prior work submitted to \p active_stream has completed.
-     */
-    cudaError_t DeviceFree(
-        void*           d_ptr)
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-        return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr);
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Frees all cached device allocations on all devices
-     */
-    cudaError_t FreeAllCached()
-    {
-    #if (CUB_PTX_ARCH > 0)
-        // Caching functionality only defined on host
-        return CubDebug(cudaErrorInvalidConfiguration);
-    #else
-
-        cudaError_t error         = cudaSuccess;
-        bool locked               = false;
-        int entrypoint_device     = INVALID_DEVICE_ORDINAL;
-        int current_device        = INVALID_DEVICE_ORDINAL;
-
-        // Lock
-        if (!locked) {
-            Lock(&spin_lock);
-            locked = true;
-        }
-
-        while (!cached_blocks.empty())
-        {
-            // Get first block
-            CachedBlocks::iterator begin = cached_blocks.begin();
-
-            // Get entry-point device ordinal if necessary
-            if (entrypoint_device == INVALID_DEVICE_ORDINAL)
-            {
-                if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break;
-            }
-
-            // Set current device ordinal if necessary
-            if (begin->device != current_device)
-            {
-                if (CubDebug(error = cudaSetDevice(begin->device))) break;
-                current_device = begin->device;
-            }
-
-            // Free device memory
-            if (CubDebug(error = cudaFree(begin->d_ptr))) break;
-            if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break;
-
-            // Reduce balance and erase entry
-            cached_bytes[current_device] -= begin->bytes;
-            cached_blocks.erase(begin);
-
-            if (debug) CubLog("\tdevice %d freed %lld bytes.\n\t\t  %lld available blocks cached (%lld bytes), %lld live blocks outstanding.\n",
-                current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device], (long long) live_blocks.size());
-        }
-
-        // Unlock
-        if (locked) {
-            Unlock(&spin_lock);
-            locked = false;
-        }
-
-        // Attempt to revert back to entry-point device if necessary
-        if (entrypoint_device != INVALID_DEVICE_ORDINAL)
-        {
-            if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error;
-        }
-
-        return error;
-
-    #endif // CUB_PTX_ARCH
-    }
-
-
-    /**
-     * \brief Destructor
-     */
-    virtual ~CachingDeviceAllocator()
-    {
-        if (!skip_cleanup)
-            FreeAllCached();
-    }
-
-};
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_arch.cuh b/thrust/system/cuda/detail/cub/util_arch.cuh
deleted file mode 100644
index 06988f0cc..000000000
--- a/thrust/system/cuda/detail/cub/util_arch.cuh
+++ /dev/null
@@ -1,198 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Static architectural properties by SM version.
- */
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass).
-#ifndef __CUDA_ARCH__
-    #define CUB_PTX_ARCH 0
-#else
-    #define CUB_PTX_ARCH __CUDA_ARCH__
-#endif
-
-/// Whether or not the source targeted by the active compiler pass is allowed to  invoke device kernels or methods from the CUDA runtime API.
-#if (CUB_PTX_ARCH == 0) || defined(CUB_CDP)
-    #define CUB_RUNTIME_ENABLED
-    #define CUB_RUNTIME_FUNCTION __host__ __device__
-#else
-    #define CUB_RUNTIME_FUNCTION __host__
-#endif
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/// Number of threads per warp (log)
-#define CUB_LOG_WARP_THREADS(arch)                      \
-	(5)
-
-/// Number of threads per warp
-#define CUB_WARP_THREADS(arch)                          \
-    (1 << CUB_LOG_WARP_THREADS(arch))
-
-/// Number of smem banks (log)
-#define CUB_LOG_SMEM_BANKS(arch)                        \
-    ((arch >= 200) ?                                    \
-        (5) :                                           \
-        (4))
-
-/// Number of smem banks
-#define CUB_SMEM_BANKS(arch)                            \
-    (1 << CUB_LOG_SMEM_BANKS(arch))
-
-/// Number of bytes per smem bank
-#define CUB_SMEM_BANK_BYTES(arch)                       \
-    (4)
-
-/// Number of smem bytes provisioned per SM
-#define CUB_SMEM_BYTES(arch)                            \
-    ((arch >= 200) ?                                    \
-		(48 * 1024) :                                   \
-		(16 * 1024))
-
-/// Smem allocation size in bytes
-#define CUB_SMEM_ALLOC_UNIT(arch)                       \
-    ((arch >= 300) ?                                    \
-    	(256) :                                         \
-		((arch >= 200) ?                                \
-		    (128) :                                     \
-		    (512)))
-
-/// Whether or not the architecture allocates registers by block (or by warp)
-#define CUB_REGS_BY_BLOCK(arch)                         \
-    ((arch >= 200) ?                                    \
-    	(false) :                                       \
-    	(true))
-
-/// Number of registers allocated at a time per block (or by warp)
-#define CUB_REG_ALLOC_UNIT(arch)                        \
-    ((arch >= 300) ?                                    \
-    	(256) :                                         \
-        ((arch >= 200) ?                                \
-        	(64) :                                      \
-            ((arch >= 120) ?                            \
-            	(512) :                                 \
-            	(256))))
-
-/// Granularity of warps for which registers are allocated
-#define CUB_WARP_ALLOC_UNIT(arch)                       \
-    ((arch >= 300) ?                                    \
-        (4) :                                           \
-        (2))
-
-/// Maximum number of threads per SM
-#define CUB_MAX_SM_THREADS(arch)                        \
-    ((arch >= 300) ?                                    \
-    	(2048) :                                        \
-        ((arch >= 200) ?                                \
-        	(1536) :                                    \
-            ((arch >= 120) ?                            \
-           		(1024) :                                \
-           		(768))))
-
-/// Maximum number of thread blocks per SM
-#define CUB_MAX_SM_BLOCKS(arch)                         \
-    ((arch >= 300) ?                                    \
-        (16) :                                          \
-        (8))
-
-/// Maximum number of threads per thread block
-#define CUB_MAX_BLOCK_THREADS(arch)                     \
-    ((arch >= 200) ?                                    \
-        (1024) :                                        \
-        (512))
-
-/// Maximum number of registers per SM
-#define CUB_MAX_SM_REGISTERS(arch)                      \
-    ((arch >= 300) ?                                    \
-        (64 * 1024) :                                   \
-        ((arch >= 200) ?                                \
-            (32 * 1024) :                               \
-            ((arch >= 120) ?                            \
-                (16 * 1024) :                           \
-                (8 * 1024))))
-
-/// Oversubscription factor
-#define CUB_SUBSCRIPTION_FACTOR(arch)                   \
-    ((arch >= 300) ?                                    \
-        (5) :                                           \
-        ((arch >= 200) ?                                \
-            (3) :                                       \
-            (10)))
-
-/// Prefer padding overhead vs X-way conflicts greater than this threshold
-#define CUB_PREFER_CONFLICT_OVER_PADDING(arch)          \
-    ((arch >= 300) ?                                    \
-        (1) :                                           \
-        (4))
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-#define CUB_PTX_LOG_WARP_THREADS                CUB_LOG_WARP_THREADS(CUB_PTX_ARCH)
-#define CUB_PTX_WARP_THREADS                    CUB_WARP_THREADS(CUB_PTX_ARCH)
-#define CUB_PTX_LOG_SMEM_BANKS                  CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH)
-#define CUB_PTX_SMEM_BANKS                      CUB_SMEM_BANKS(CUB_PTX_ARCH)
-#define CUB_PTX_SMEM_BANK_BYTES                 CUB_SMEM_BANK_BYTES(CUB_PTX_ARCH)
-#define CUB_PTX_SMEM_BYTES                      CUB_SMEM_BYTES(CUB_PTX_ARCH)
-#define CUB_PTX_SMEM_ALLOC_UNIT                 CUB_SMEM_ALLOC_UNIT(CUB_PTX_ARCH)
-#define CUB_PTX_REGS_BY_BLOCK                   CUB_REGS_BY_BLOCK(CUB_PTX_ARCH)
-#define CUB_PTX_REG_ALLOC_UNIT                  CUB_REG_ALLOC_UNIT(CUB_PTX_ARCH)
-#define CUB_PTX_WARP_ALLOC_UNIT                 CUB_WARP_ALLOC_UNIT(CUB_PTX_ARCH)
-#define CUB_PTX_MAX_SM_THREADS                  CUB_MAX_SM_THREADS(CUB_PTX_ARCH)
-#define CUB_PTX_MAX_SM_BLOCKS                   CUB_MAX_SM_BLOCKS(CUB_PTX_ARCH)
-#define CUB_PTX_MAX_BLOCK_THREADS               CUB_MAX_BLOCK_THREADS(CUB_PTX_ARCH)
-#define CUB_PTX_MAX_SM_REGISTERS                CUB_MAX_SM_REGISTERS(CUB_PTX_ARCH)
-#define CUB_PTX_PREFER_CONFLICT_OVER_PADDING    CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH)
-
-#endif  // Do not document
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_debug.cuh b/thrust/system/cuda/detail/cub/util_debug.cuh
deleted file mode 100644
index 375fd5e40..000000000
--- a/thrust/system/cuda/detail/cub/util_debug.cuh
+++ /dev/null
@@ -1,115 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Error and event logging routines.
- *
- * The following macros definitions are supported:
- * - \p CUB_LOG.  Simple event messages are printed to \p stdout.
- */
-
-#pragma once
-
-#include <stdio.h>
-#include "util_namespace.cuh"
-#include "util_arch.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-
-/// CUB error reporting macro (prints error messages to stderr)
-#if (defined(DEBUG) || defined(_DEBUG))
-    #define CUB_STDERR
-#endif
-
-
-
-/**
- * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context.
- *
- * \return The CUDA error.
- */
-__host__ __device__ __forceinline__ cudaError_t Debug(
-    cudaError_t     error,
-    const char*     filename,
-    int             line)
-{
-#ifdef CUB_STDERR
-    if (error)
-    {
-    #if (CUB_PTX_ARCH == 0)
-        fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error));
-        fflush(stderr);
-    #elif (CUB_PTX_ARCH >= 200)
-        printf("CUDA error %d [block %d, thread %d, %s, %d]\n", error, blockIdx.x, threadIdx.x, filename, line);
-    #endif
-    }
-#endif
-    return error;
-}
-
-
-/**
- * \brief Debug macro
- */
-#define CubDebug(e) cub::Debug((e), __FILE__, __LINE__)
-
-
-/**
- * \brief Debug macro with exit
- */
-#define CubDebugExit(e) if (cub::Debug((e), __FILE__, __LINE__)) { exit(1); }
-
-
-/**
- * \brief Log macro for printf statements.
- */
-#if (CUB_PTX_ARCH == 0)
-    #define CubLog(format, ...) printf(format,__VA_ARGS__);
-#elif (CUB_PTX_ARCH >= 200)
-    #define CubLog(format, ...) printf("[block %d, thread %d]: " format, blockIdx.x, threadIdx.x, __VA_ARGS__);
-#endif
-
-
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_device.cuh b/thrust/system/cuda/detail/cub/util_device.cuh
deleted file mode 100644
index 332ced5ce..000000000
--- a/thrust/system/cuda/detail/cub/util_device.cuh
+++ /dev/null
@@ -1,372 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Properties of a given CUDA device and the corresponding PTX bundle
- */
-
-#pragma once
-
-#include "util_arch.cuh"
-#include "util_debug.cuh"
-#include "util_namespace.cuh"
-#include "util_macro.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilMgmt
- * @{
- */
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
- */
-template <typename T>
-__global__ void EmptyKernel(void) { }
-
-
-/**
- * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
- */
-template <int ALLOCATIONS>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t AliasTemporaries(
-    void    *d_temp_storage,                    ///< [in] %Device allocation of temporary storage.  When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
-    size_t  &temp_storage_bytes,                ///< [in,out] Size in bytes of \t d_temp_storage allocation
-    void*   (&allocations)[ALLOCATIONS],        ///< [in,out] Pointers to device allocations needed
-    size_t  (&allocation_sizes)[ALLOCATIONS])   ///< [in] Sizes in bytes of device allocations needed
-{
-    const int ALIGN_BYTES   = 256;
-    const int ALIGN_MASK    = ~(ALIGN_BYTES - 1);
-
-    // Compute exclusive prefix sum over allocation requests
-    size_t allocation_offsets[ALLOCATIONS];
-    size_t bytes_needed = 0;
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
-        allocation_offsets[i] = bytes_needed;
-        bytes_needed += allocation_bytes;
-    }
-
-    // Check if the caller is simply requesting the size of the storage allocation
-    if (!d_temp_storage)
-    {
-        temp_storage_bytes = bytes_needed;
-        return cudaSuccess;
-    }
-
-    // Check if enough storage provided
-    if (temp_storage_bytes < bytes_needed)
-    {
-        return CubDebug(cudaErrorInvalidValue);
-    }
-
-    // Alias
-    for (int i = 0; i < ALLOCATIONS; ++i)
-    {
-        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
-    }
-
-    return cudaSuccess;
-}
-
-
-
-#endif  // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/**
- * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version)
-{
-    struct Dummy
-    {
-        /// Type definition of the EmptyKernel kernel entry point
-        typedef void (*EmptyKernelPtr)();
-
-        /// Force EmptyKernel<void> to be generated if this class is used
-        CUB_RUNTIME_FUNCTION __forceinline__
-        EmptyKernelPtr Empty()
-        {
-            return EmptyKernel<void>;
-        }
-    };
-
-
-#ifndef CUB_RUNTIME_ENABLED
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#elif (CUB_PTX_ARCH > 0)
-
-    ptx_version = CUB_PTX_ARCH;
-    return cudaSuccess;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        cudaFuncAttributes empty_kernel_attrs;
-        if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel<void>))) break;
-        ptx_version = empty_kernel_attrs.ptxVersion * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-/**
- * \brief Retrieves the SM version (major * 100 + minor * 10)
- */
-CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal)
-{
-#ifndef CUB_RUNTIME_ENABLED
-
-    // CUDA API calls not supported from this device
-    return cudaErrorInvalidConfiguration;
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        // Fill in SM version
-        int major, minor;
-        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break;
-        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break;
-        sm_version = major * 100 + minor * 10;
-    }
-    while (0);
-
-    return error;
-
-#endif
-}
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Synchronize the stream if specified
- */
-CUB_RUNTIME_FUNCTION __forceinline__
-static cudaError_t SyncStream(cudaStream_t stream)
-{
-#if (CUB_PTX_ARCH == 0)
-    return cudaStreamSynchronize(stream);
-#else
-    // Device can't yet sync on a specific stream
-    return cudaDeviceSynchronize();
-#endif
-}
-
-
-/**
- * \brief Computes maximum SM occupancy in thread blocks for the given kernel function pointer \p kernel_ptr.
- */
-template <typename KernelPtr>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t MaxSmOccupancy(
-    int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
-    int                 sm_version,                 ///< [in] The SM architecture to run on
-    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
-    int                 block_threads)              ///< [in] Number of threads per thread block
-{
-#ifndef CUB_RUNTIME_ENABLED
-
-    // CUDA API calls not supported from this device
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        int warp_threads        = 1 << CUB_LOG_WARP_THREADS(sm_version);
-        int max_sm_blocks       = CUB_MAX_SM_BLOCKS(sm_version);
-        int max_sm_warps        = CUB_MAX_SM_THREADS(sm_version) / warp_threads;
-        int regs_by_block       = CUB_REGS_BY_BLOCK(sm_version);
-        int max_sm_registers    = CUB_MAX_SM_REGISTERS(sm_version);
-        int warp_alloc_unit     = CUB_WARP_ALLOC_UNIT(sm_version);
-        int smem_alloc_unit     = CUB_SMEM_ALLOC_UNIT(sm_version);
-        int reg_alloc_unit      = CUB_REG_ALLOC_UNIT(sm_version);
-        int smem_bytes          = CUB_SMEM_BYTES(sm_version);
-
-        // Get kernel attributes
-        cudaFuncAttributes kernel_attrs;
-        if (CubDebug(error = cudaFuncGetAttributes(&kernel_attrs, kernel_ptr))) break;
-
-        // Number of warps per threadblock
-        int block_warps = (block_threads +  warp_threads - 1) / warp_threads;
-
-        // Max warp occupancy
-        int max_warp_occupancy = (block_warps > 0) ?
-            max_sm_warps / block_warps :
-            max_sm_blocks;
-
-        // Maximum register occupancy
-        int max_reg_occupancy;
-        if ((block_threads == 0) || (kernel_attrs.numRegs == 0))
-        {
-            // Prevent divide-by-zero
-            max_reg_occupancy = max_sm_blocks;
-        }
-        else if (regs_by_block)
-        {
-            // Allocates registers by threadblock
-            int block_regs = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads * block_warps, reg_alloc_unit);
-            max_reg_occupancy = max_sm_registers / block_regs;
-        }
-        else
-        {
-            // Allocates registers by warp
-            int sm_sides                = warp_alloc_unit;
-            int sm_registers_per_side   = max_sm_registers / sm_sides;
-            int regs_per_warp           = CUB_ROUND_UP_NEAREST(kernel_attrs.numRegs * warp_threads, reg_alloc_unit);
-            int warps_per_side          = sm_registers_per_side / regs_per_warp;
-            int warps                   = warps_per_side * sm_sides;
-            max_reg_occupancy           = warps / block_warps;
-        }
-
-        // Shared memory per threadblock
-        int block_allocated_smem = CUB_ROUND_UP_NEAREST(
-            (int) kernel_attrs.sharedSizeBytes,
-            smem_alloc_unit);
-
-        // Max shared memory occupancy
-        int max_smem_occupancy = (block_allocated_smem > 0) ?
-            (smem_bytes / block_allocated_smem) :
-            max_sm_blocks;
-
-        // Max occupancy
-        max_sm_occupancy = CUB_MIN(
-            CUB_MIN(max_sm_blocks, max_warp_occupancy),
-            CUB_MIN(max_smem_occupancy, max_reg_occupancy));
-
-//            printf("max_smem_occupancy(%d), max_warp_occupancy(%d), max_reg_occupancy(%d) \n", max_smem_occupancy, max_warp_occupancy, max_reg_occupancy);
-
-    } while (0);
-
-    return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-}
-
-#endif  // Do not document
-
-
-/**
- * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
- *
- * \par Snippet
- * The code snippet below illustrates the use of the MaxSmOccupancy function.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
- *
- * template <typename T>
- * __global__ void ExampleKernel()
- * {
- *     // Allocate shared memory for BlockScan
- *     __shared__ volatile T buffer[4096];
- *
- *        ...
- * }
- *
- *     ...
- *
- * // Determine SM occupancy for ExampleKernel specialized for unsigned char
- * int max_sm_occupancy;
- * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
- *
- * // max_sm_occupancy  <-- 4 on SM10
- * // max_sm_occupancy  <-- 8 on SM20
- * // max_sm_occupancy  <-- 12 on SM35
- *
- * \endcode
- *
- */
-template <typename KernelPtr>
-CUB_RUNTIME_FUNCTION __forceinline__
-cudaError_t MaxSmOccupancy(
-    int                 &max_sm_occupancy,          ///< [out] maximum number of thread blocks that can reside on a single SM
-    KernelPtr           kernel_ptr,                 ///< [in] Kernel pointer for which to compute SM occupancy
-    int                 block_threads)              ///< [in] Number of threads per thread block
-{
-#ifndef CUB_RUNTIME_ENABLED
-
-    // CUDA API calls not supported from this device
-    return CubDebug(cudaErrorInvalidConfiguration);
-
-#else
-
-    cudaError_t error = cudaSuccess;
-    do
-    {
-        // Get device ordinal
-        int device_ordinal;
-        if (CubDebug(error = cudaGetDevice(&device_ordinal))) break;
-
-        // Get device SM version
-        int sm_version;
-        if (CubDebug(error = SmVersion(sm_version, device_ordinal))) break;
-
-        // Get SM occupancy
-        if (CubDebug(error = MaxSmOccupancy(max_sm_occupancy, sm_version, kernel_ptr, block_threads))) break;
-
-    } while (0);
-
-    return error;
-
-#endif  // CUB_RUNTIME_ENABLED
-
-}
-
-
-/** @} */       // end group UtilMgmt
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_macro.cuh b/thrust/system/cuda/detail/cub/util_macro.cuh
deleted file mode 100644
index a94031a4c..000000000
--- a/thrust/system/cuda/detail/cub/util_macro.cuh
+++ /dev/null
@@ -1,107 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/******************************************************************************
- * Common C/C++ macro utilities
- ******************************************************************************/
-
-#pragma once
-
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-/**
- * Align struct
- */
-#if defined(_WIN32) || defined(_WIN64)
-    #define CUB_ALIGN(bytes) __declspec(align(32))
-#else
-    #define CUB_ALIGN(bytes) __attribute__((aligned(bytes)))
-#endif
-
-/**
- * Select maximum(a, b)
- */
-#define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a))
-
-/**
- * Select minimum(a, b)
- */
-#define CUB_MIN(a, b) (((b) < (a)) ? (b) : (a))
-
-/**
- * Quotient of x/y rounded down to nearest integer
- */
-#define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y))
-
-/**
- * Quotient of x/y rounded up to nearest integer
- */
-#define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y))
-
-/**
- * x rounded up to the nearest multiple of y
- */
-#define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y)
-
-/**
- * x rounded down to the nearest multiple of y
- */
-#define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y)
-
-/**
- * Return character string for given type
- */
-#define CUB_TYPE_STRING(type) ""#type
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-    #define CUB_CAT_(a, b) a ## b
-    #define CUB_CAT(a, b) CUB_CAT_(a, b)
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * Static assert
- */
-#define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1]
-
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_ptx.cuh b/thrust/system/cuda/detail/cub/util_ptx.cuh
deleted file mode 100644
index 4172de2ad..000000000
--- a/thrust/system/cuda/detail/cub/util_ptx.cuh
+++ /dev/null
@@ -1,606 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * PTX intrinsics
- */
-
-
-#pragma once
-
-#include "util_type.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilPtx
- * @{
- */
-
-
-/******************************************************************************
- * PTX helper macros
- ******************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Register modifier for pointer-types (for inlining PTX assembly)
- */
-#if defined(_WIN64) || defined(__LP64__)
-    #define __CUB_LP64__ 1
-    // 64-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "l"
-    #define _CUB_ASM_PTR_SIZE_ "u64"
-#else
-    #define __CUB_LP64__ 0
-    // 32-bit register modifier for inlined asm
-    #define _CUB_ASM_PTR_ "r"
-    #define _CUB_ASM_PTR_SIZE_ "u32"
-#endif
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Inlined PTX intrinsics
- ******************************************************************************/
-
-/**
- * \brief Shift-right then add.  Returns (\p x >> \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHR_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x >> shift) + addend;
-#endif
-    return ret;
-}
-
-
-/**
- * \brief Shift-left then add.  Returns (\p x << \p shift) + \p addend.
- */
-__device__ __forceinline__ unsigned int SHL_ADD(
-    unsigned int x,
-    unsigned int shift,
-    unsigned int addend)
-{
-    unsigned int ret;
-#if CUB_PTX_ARCH >= 200
-    asm("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" :
-        "=r"(ret) : "r"(x), "r"(shift), "r"(addend));
-#else
-    ret = (x << shift) + addend;
-#endif
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Bitfield-extract.
- */
-template <typename UnsignedBits, int BYTE_LEN>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<BYTE_LEN>      byte_len)
-{
-    unsigned int bits;
-#if CUB_PTX_ARCH >= 200
-    asm("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits));
-#else
-    const unsigned int MASK = (1 << num_bits) - 1;
-    bits = (source >> bit_start) & MASK;
-#endif
-    return bits;
-}
-
-
-/**
- * Bitfield-extract for 64-bit types.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits            source,
-    unsigned int            bit_start,
-    unsigned int            num_bits,
-    Int2Type<8>             byte_len)
-{
-    const unsigned long long MASK = (1ull << num_bits) - 1;
-    return (source >> bit_start) & MASK;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Bitfield-extract.  Extracts \p num_bits from \p source starting at bit-offset \p bit_start.  The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type.
- */
-template <typename UnsignedBits>
-__device__ __forceinline__ unsigned int BFE(
-    UnsignedBits source,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-    return BFE(source, bit_start, num_bits, Int2Type<sizeof(UnsignedBits)>());
-}
-
-
-/**
- * \brief Bitfield insert.  Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start.
- */
-__device__ __forceinline__ void BFI(
-    unsigned int &ret,
-    unsigned int x,
-    unsigned int y,
-    unsigned int bit_start,
-    unsigned int num_bits)
-{
-#if CUB_PTX_ARCH >= 200
-    asm("bfi.b32 %0, %1, %2, %3, %4;" :
-        "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits));
-#else
-    x <<= bit_start;
-    unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start;
-    unsigned int MASK_Y = ~MASK_X;
-    ret = (y & MASK_Y) | (x & MASK_X);
-#endif
-}
-
-
-/**
- * \brief Three-operand add.  Returns \p x + \p y + \p z.
- */
-__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z)
-{
-#if CUB_PTX_ARCH >= 200
-    asm("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z));
-#else
-    x = x + y + z;
-#endif
-    return x;
-}
-
-
-/**
- * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register.  For SM2.0 or later.
- *
- * \par
- * The bytes in the two source registers \p a and \p b are numbered from 0 to 7:
- * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes
- * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within
- * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0}
- *
- * \par Snippet
- * The code snippet below illustrates byte-permute.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     int a        = 0x03020100;
- *     int b        = 0x07060504;
- *     int index    = 0x00007531;
- *
- *     int selected = PRMT(a, b, index);    // 0x07050301
- *
- * \endcode
- *
- */
-__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index)
-{
-    int ret;
-    asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index));
-    return ret;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Sync-threads barrier.
- */
-__device__ __forceinline__ void BAR(int count)
-{
-    asm volatile("bar.sync 1, %0;" : : "r"(count));
-}
-
-
-/**
- * Floating point multiply. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FMUL_RZ(float a, float b)
-{
-    float d;
-    asm("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b));
-    return d;
-}
-
-
-/**
- * Floating point multiply-add. (Mantissa LSB rounds towards zero.)
- */
-__device__ __forceinline__ float FFMA_RZ(float a, float b, float c)
-{
-    float d;
-    asm("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c));
-    return d;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Terminates the calling thread
- */
-__device__ __forceinline__ void ThreadExit() {
-    asm("exit;");
-}    
-
-
-/**
- * \brief Returns the row-major linear thread identifier for a multidimensional threadblock
- */
-__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z)
-{
-    return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) +
-            ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) +
-            threadIdx.x;
-}
-
-
-/**
- * \brief Returns the warp lane ID of the calling thread
- */
-__device__ __forceinline__ unsigned int LaneId()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %laneid;" : "=r"(ret) );
-    return ret;
-}
-
-
-/**
- * \brief Returns the warp ID of the calling thread.  Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block.
- */
-__device__ __forceinline__ unsigned int WarpId()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %warpid;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLt()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskLe()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %lanemask_le;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGt()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %lanemask_gt;" : "=r"(ret) );
-    return ret;
-}
-
-/**
- * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread
- */
-__device__ __forceinline__ unsigned int LaneMaskGe()
-{
-    unsigned int ret;
-    asm("mov.u32 %0, %lanemask_ge;" : "=r"(ret) );
-    return ret;
-}
-
-/** @} */       // end group UtilPtx
-
-
-
-
-/**
- * \brief Shuffle-up for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>-<tt>src_offset</tt></sub>.  For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * predecessor of its predecessor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleUp(thread_data, 2);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleUp(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset)         ///< [in] The relative down-offset of the peer to read from
-{
-    enum
-    {
-        SHFL_C = 0,
-    };
-
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    #pragma unroll
-    for (int WORD = 0; WORD < WORDS; ++WORD)
-    {
-        unsigned int shuffle_word = input_alias[WORD];
-        asm(
-            "  shfl.up.b32 %0, %1, %2, %3;"
-            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
-        output_alias[WORD] = (ShuffleWord) shuffle_word;
-    }
-
-    return output;
-}
-
-
-/**
- * \brief Shuffle-down for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><em>i</em>+<tt>src_offset</tt></sub>.  For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread.  ![](shfl_down_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from the
- * successor of its successor.
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from two ranks below
- *     double peer_data = ShuffleDown(thread_data, 2);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleDown(
-    T               input,              ///< [in] The value to broadcast
-    int             src_offset)         ///< [in] The relative up-offset of the peer to read from
-{
-    enum
-    {
-        SHFL_C = CUB_PTX_WARP_THREADS - 1,
-    };
-
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    #pragma unroll
-    for (int WORD = 0; WORD < WORDS; ++WORD)
-    {
-        unsigned int shuffle_word = input_alias[WORD];
-        asm(
-            "  shfl.down.b32 %0, %1, %2, %3;"
-            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_offset), "r"(SHFL_C));
-        output_alias[WORD] = (ShuffleWord) shuffle_word;
-    }
-
-    return output;
-}
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread.  ![](shfl_broadcast_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleBroadcast(
-    T               input,                                          ///< [in] The value to broadcast
-    int             src_lane,                                       ///< [in] Which warp lane is to do the broadcasting
-    int             logical_warp_threads)                           ///< [in] Number of threads per logical warp
-{
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-
-    const int       WORDS           = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord);
-    T               output;
-    ShuffleWord     *output_alias   = reinterpret_cast<ShuffleWord *>(&output);
-    ShuffleWord     *input_alias    = reinterpret_cast<ShuffleWord *>(&input);
-
-    #pragma unroll
-    for (int WORD = 0; WORD < WORDS; ++WORD)
-    {
-        unsigned int shuffle_word = input_alias[WORD];
-        asm("shfl.idx.b32 %0, %1, %2, %3;"
-            : "=r"(shuffle_word) : "r"(shuffle_word), "r"(src_lane), "r"(logical_warp_threads - 1));
-        output_alias[WORD] = (ShuffleWord) shuffle_word;
-    }
-
-    return output;
-}
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
- /**
- * \brief Shuffle-broadcast for any data type.  Each <em>warp-lane<sub>i</sub></em> obtains the value \p input contributed by <em>warp-lane</em><sub><tt>src_lane</tt></sub>.  For \p src_lane < 0 or \p src_lane >= WARP_THREADS, then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png)
- * \ingroup WarpModule
- *
- * \par
- * - Available only for SM3.0 or newer
- *
- * \par Snippet
- * The code snippet below illustrates each thread obtaining a \p double value from <em>warp-lane</em><sub>0</sub>.
- *
- * \par
- * \code
- * #include <cub/cub.cuh>   // or equivalently <cub/util_ptx.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Obtain one input item per thread
- *     double thread_data = ...
- *
- *     // Obtain item from thread 0
- *     double peer_data = ShuffleBroadcast(thread_data, 0);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the first warp of threads is <tt>{1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}</tt>.
- * The corresponding output \p peer_data will be <tt>{1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}</tt>.
- *
- */
-template <typename T>
-__device__ __forceinline__ T ShuffleBroadcast(
-    T               input,              ///< [in] The value to broadcast
-    int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
-{
-    return ShuffleBroadcast(input, src_lane, CUB_PTX_WARP_THREADS);
-}
-
-
-
-
-
-/**
- * \brief Portable implementation of __all
- * \ingroup WarpModule
- */
-__device__ __forceinline__ int WarpAll(int cond)
-{
-#if CUB_PTX_ARCH < 120
-
-    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];
-
-    if (LaneId() == 0)
-        warp_signals[WarpId()] = 1;
-
-    if (cond == 0)
-        warp_signals[WarpId()] = 0;
-
-    return warp_signals[WarpId()];
-
-#else
-
-    return __all(cond);
-
-#endif
-}
-
-
-/**
- * \brief Portable implementation of __any
- * \ingroup WarpModule
- */
-__device__ __forceinline__ int WarpAny(int cond)
-{
-#if CUB_PTX_ARCH < 120
-
-    __shared__ volatile int warp_signals[CUB_PTX_MAX_SM_THREADS / CUB_PTX_WARP_THREADS];
-
-    if (LaneId() == 0)
-        warp_signals[WarpId()] = 0;
-
-    if (cond)
-        warp_signals[WarpId()] = 1;
-
-    return warp_signals[WarpId()];
-
-#else
-
-    return __any(cond);
-
-#endif
-}
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/util_type.cuh b/thrust/system/cuda/detail/cub/util_type.cuh
deleted file mode 100644
index 242a1a178..000000000
--- a/thrust/system/cuda/detail/cub/util_type.cuh
+++ /dev/null
@@ -1,1028 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * Common type manipulation (metaprogramming) utilities
- */
-
-#pragma once
-
-#include <iostream>
-#include <limits>
-
-#include "util_macro.cuh"
-#include "util_arch.cuh"
-#include "util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup UtilModule
- * @{
- */
-
-
-
-/******************************************************************************
- * Type equality
- ******************************************************************************/
-
-/**
- * \brief Type selection (<tt>IF ? ThenType : ElseType</tt>)
- */
-template <bool IF, typename ThenType, typename ElseType>
-struct If
-{
-    /// Conditional type result
-    typedef ThenType Type;      // true
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename ThenType, typename ElseType>
-struct If<false, ThenType, ElseType>
-{
-    typedef ElseType Type;      // false
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Conditional types
- ******************************************************************************/
-
-/**
- * \brief Type equality test
- */
-template <typename A, typename B>
-struct Equals
-{
-    enum {
-        VALUE = 0,
-        NEGATE = 1
-    };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename A>
-struct Equals <A, A>
-{
-    enum {
-        VALUE = 1,
-        NEGATE = 0
-    };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Marker types
- ******************************************************************************/
-
-/**
- * \brief A simple "NULL" marker type
- */
-struct NullType
-{
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    template <typename T>
-    __host__ __device__ __forceinline__ NullType& operator =(const T& b) { return *this; }
-
-    __host__ __device__ __forceinline__ bool operator ==(const NullType& b) { return true; }
-
-    __host__ __device__ __forceinline__ bool operator !=(const NullType& b) { return false; }
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-};
-
-
-/**
- * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values)
- */
-template <int A>
-struct Int2Type
-{
-   enum {VALUE = A};
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/******************************************************************************
- * Size and alignment
- ******************************************************************************/
-
-/// Structure alignment
-template <typename T>
-struct AlignBytes
-{
-    struct Pad
-    {
-        T       val;
-        char    byte;
-    };
-
-    enum
-    {
-        /// The alignment of T in bytes
-        ALIGN_BYTES = sizeof(Pad) - sizeof(T)
-    };
-};
-
-// Specializations where host C++ compilers (e.g., Windows) may disagree with device C++ compilers (EDG)
-
-template <> struct AlignBytes<short4>               { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<ushort4>              { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<int2>                 { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<uint2>                { enum { ALIGN_BYTES = 8 }; };
-#ifdef _WIN32
-    template <> struct AlignBytes<long2>            { enum { ALIGN_BYTES = 8 }; };
-    template <> struct AlignBytes<ulong2>           { enum { ALIGN_BYTES = 8 }; };
-#endif
-template <> struct AlignBytes<long long>            { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<unsigned long long>   { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<float2>               { enum { ALIGN_BYTES = 8 }; };
-template <> struct AlignBytes<double>               { enum { ALIGN_BYTES = 8 }; };
-
-template <> struct AlignBytes<int4>                 { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<uint4>                { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<float4>               { enum { ALIGN_BYTES = 16 }; };
-#ifndef _WIN32
-    template <> struct AlignBytes<long2>            { enum { ALIGN_BYTES = 16 }; };
-    template <> struct AlignBytes<ulong2>           { enum { ALIGN_BYTES = 16 }; };
-#endif
-template <> struct AlignBytes<long4>                { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<ulong4>               { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<longlong2>            { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<ulonglong2>           { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<double2>              { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<longlong4>            { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<ulonglong4>           { enum { ALIGN_BYTES = 16 }; };
-template <> struct AlignBytes<double4>              { enum { ALIGN_BYTES = 16 }; };
-
-
-/// Unit-words of data movement
-template <typename T>
-struct UnitWord
-{
-    enum {
-        ALIGN_BYTES = AlignBytes<T>::ALIGN_BYTES
-    };
-
-    template <typename Unit>
-    struct IsMultiple
-    {
-        enum {
-            UNIT_ALIGN_BYTES    = AlignBytes<Unit>::ALIGN_BYTES,
-            IS_MULTIPLE         = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0)
-        };
-    };
-
-    /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int>::IS_MULTIPLE,
-        unsigned int,
-        typename If<IsMultiple<short>::IS_MULTIPLE,
-            unsigned short,
-            unsigned char>::Type>::Type         ShuffleWord;
-
-    /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<long long>::IS_MULTIPLE,
-        unsigned long long,
-        ShuffleWord>::Type                      VolatileWord;
-
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<longlong2>::IS_MULTIPLE,
-        ulonglong2,
-        VolatileWord>::Type                     DeviceWord;
-
-    /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename If<IsMultiple<int4>::IS_MULTIPLE,
-        uint4,
-        typename If<IsMultiple<int2>::IS_MULTIPLE,
-            uint2,
-            ShuffleWord>::Type>::Type           TextureWord;
-};
-
-
-// float2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float2>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float       VolatileWord;
-    typedef uint2       DeviceWord;
-#else
-    typedef unsigned long long   VolatileWord;
-    typedef unsigned long long   DeviceWord;
-#endif
-    typedef float2      TextureWord;
-};
-
-// float4 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <float4>
-{
-    typedef int         ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef float               VolatileWord;
-    typedef uint4               DeviceWord;
-#else
-    typedef unsigned long long  VolatileWord;
-    typedef ulonglong2          DeviceWord;
-#endif
-    typedef float4              TextureWord;
-};
-
-
-// char2 specialization workaround (for SM10-SM13)
-template <>
-struct UnitWord <char2>
-{
-    typedef unsigned short      ShuffleWord;
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-    typedef unsigned short      VolatileWord;
-    typedef short               DeviceWord;
-#else
-    typedef unsigned short      VolatileWord;
-    typedef unsigned short      DeviceWord;
-#endif
-    typedef unsigned short      TextureWord;
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Vector type inference utilities.
- ******************************************************************************/
-
-/**
- * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists.  Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields.
- */
-template <typename T, int vec_elements> struct CubVector;
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-enum
-{
-    /// The maximum number of elements in CUDA vector types
-    MAX_VEC_ELEMENTS = 4,
-};
-
-
-/**
- * Generic vector-1 type
- */
-template <typename T>
-struct CubVector<T, 1>
-{
-    T x;
-
-    typedef T BaseType;
-    typedef CubVector<T, 1> Type;
-};
-
-/**
- * Generic vector-2 type
- */
-template <typename T>
-struct CubVector<T, 2>
-{
-    T x;
-    T y;
-
-    typedef T BaseType;
-    typedef CubVector<T, 2> Type;
-};
-
-/**
- * Generic vector-3 type
- */
-template <typename T>
-struct CubVector<T, 3>
-{
-    T x;
-    T y;
-    T z;
-
-    typedef T BaseType;
-    typedef CubVector<T, 3> Type;
-};
-
-/**
- * Generic vector-4 type
- */
-template <typename T>
-struct CubVector<T, 4>
-{
-    T x;
-    T y;
-    T z;
-    T w;
-
-    typedef T BaseType;
-    typedef CubVector<T, 4> Type;
-};
-
-
-/**
- * Macro for expanding partially-specialized built-in vector types
- */
-#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type)                                                    \
-                                                                                                        \
-    template<> struct CubVector<base_type, 1> : short_type##1                                           \
-    {                                                                                                   \
-      typedef base_type       BaseType;                                                                 \
-      typedef short_type##1   Type;                                                                     \
-      __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x + other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-      __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {           \
-          CubVector retval;                                                                             \
-          retval.x = x - other.x;                                                                       \
-          return retval;                                                                                \
-      }                                                                                                 \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 2> : short_type##2                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##2   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 3> : short_type##3                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##3   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };                                                                                                  \
-                                                                                                        \
-    template<> struct CubVector<base_type, 4> : short_type##4                                           \
-    {                                                                                                   \
-        typedef base_type       BaseType;                                                               \
-        typedef short_type##4   Type;                                                                   \
-        __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x + other.x;                                                                     \
-            retval.y = y + other.y;                                                                     \
-            retval.z = z + other.z;                                                                     \
-            retval.w = w + other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-        __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const {         \
-            CubVector retval;                                                                           \
-            retval.x = x - other.x;                                                                     \
-            retval.y = y - other.y;                                                                     \
-            retval.z = z - other.z;                                                                     \
-            retval.w = w - other.w;                                                                     \
-            return retval;                                                                              \
-        }                                                                                               \
-    };
-
-
-
-// Expand CUDA vector types for built-in primitives
-CUB_DEFINE_VECTOR_TYPE(char,               char)
-CUB_DEFINE_VECTOR_TYPE(signed char,        char)
-CUB_DEFINE_VECTOR_TYPE(short,              short)
-CUB_DEFINE_VECTOR_TYPE(int,                int)
-CUB_DEFINE_VECTOR_TYPE(long,               long)
-CUB_DEFINE_VECTOR_TYPE(long long,          longlong)
-CUB_DEFINE_VECTOR_TYPE(unsigned char,      uchar)
-CUB_DEFINE_VECTOR_TYPE(unsigned short,     ushort)
-CUB_DEFINE_VECTOR_TYPE(unsigned int,       uint)
-CUB_DEFINE_VECTOR_TYPE(unsigned long,      ulong)
-CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong)
-CUB_DEFINE_VECTOR_TYPE(float,              float)
-CUB_DEFINE_VECTOR_TYPE(double,             double)
-CUB_DEFINE_VECTOR_TYPE(bool,               uchar)
-
-// Undefine macros
-#undef CUB_DEFINE_VECTOR_TYPE
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Wrapper types
- ******************************************************************************/
-
-/**
- * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions
- */
-template <typename T>
-struct Uninitialized
-{
-    /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T
-    typedef typename UnitWord<T>::DeviceWord DeviceWord;
-
-    enum
-    {
-        WORDS = sizeof(T) / sizeof(DeviceWord)
-    };
-
-    /// Backing storage
-    DeviceWord storage[WORDS];
-
-    /// Alias
-    __host__ __device__ __forceinline__ T& Alias()
-    {
-        return reinterpret_cast<T&>(*this);
-    }
-};
-
-
-/**
- * \brief An item value paired with a corresponding offset
- */
-template <typename _T, typename _Offset>
-struct ItemOffsetPair
-{
-    typedef _T        T;                ///< Item data type
-    typedef _Offset   Offset;           ///< Integer offset data type
-
-#if (CUB_PTX_ARCH == 0)
-    union
-    {
-        Offset                              offset;     ///< Offset
-        typename UnitWord<T>::DeviceWord    align0;     ///< Alignment/padding (for Win32 consistency between host/device)
-    };
-#else
-    Offset                                  offset;     ///< Offset
-#endif
-
-    T                                       value;      ///< Item value
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const ItemOffsetPair &b)
-    {
-        return (value != b.value) || (offset != b.offset);
-    }
-};
-
-
-/**
- * \brief A key identifier paired with a corresponding value
- */
-template <typename _Key, typename _Value>
-struct KeyValuePair
-{
-    typedef _Key    Key;                ///< Key data type
-    typedef _Value  Value;              ///< Value data type
-
-    Value                   value;      ///< Item value
-    Key                     key;        ///< Item key
-
-    /// Inequality operator
-    __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b)
-    {
-        return (value != b.value) || (key != b.key);
-    }
-
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/**
- * Workaround for inability for SM1.x compiler to properly zero-initialize POD structures when it's supposed to
- */
-template <typename T>
-__host__ __device__ __forceinline__ T ZeroInitialize()
-{
-#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130)
-
-    typedef typename UnitWord<T>::ShuffleWord ShuffleWord;
-    const int MULTIPLE = sizeof(T) / sizeof(ShuffleWord);
-    ShuffleWord words[MULTIPLE];
-    #pragma unroll
-    for (int i = 0; i < MULTIPLE; ++i)
-        words[i] = 0;
-    return *reinterpret_cast<T*>(words);
-
-#else
-
-    return T();
-
-#endif
-}
-
-
-/**
- * \brief A wrapper for passing simple static arrays as kernel parameters
- */
-template <typename T, int COUNT>
-struct ArrayWrapper
-{
-    /// Static array of type \p T
-    T array[COUNT];
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/**
- * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth.
- *
- * Many multi-pass computations require a pair of "ping-pong" storage
- * buffers (e.g., one for reading from and the other for writing to, and then
- * vice-versa for the subsequent pass).  This structure wraps a set of device
- * buffers and a "selector" member to track which is "current".
- */
-template <typename T>
-struct DoubleBuffer
-{
-    /// Pair of device buffer pointers
-    T *d_buffers[2];
-
-    ///  Selector into \p d_buffers (i.e., the active/valid buffer)
-    int selector;
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer()
-    {
-        selector = 0;
-        d_buffers[0] = NULL;
-        d_buffers[1] = NULL;
-    }
-
-    /// \brief Constructor
-    __host__ __device__ __forceinline__ DoubleBuffer(
-        T *d_current,         ///< The currently valid buffer
-        T *d_alternate)       ///< Alternate storage buffer of the same size as \p d_current
-    {
-        selector = 0;
-        d_buffers[0] = d_current;
-        d_buffers[1] = d_alternate;
-    }
-
-    /// \brief Return pointer to the currently valid buffer
-    __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; }
-};
-
-
-
-/******************************************************************************
- * Static math
- ******************************************************************************/
-
-/**
- * \brief Statically determine log2(N), rounded up.
- *
- * For example:
- *     Log2<8>::VALUE   // 3
- *     Log2<3>::VALUE   // 2
- */
-template <int N, int CURRENT_VAL = N, int COUNT = 0>
-struct Log2
-{
-    /// Static logarithm value
-    enum { VALUE = Log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };         // Inductive case
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-template <int N, int COUNT>
-struct Log2<N, 0, COUNT>
-{
-    enum {VALUE = (1 << (COUNT - 1) < N) ?                                  // Base case
-        COUNT :
-        COUNT - 1 };
-};
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Statically determine if N is a power-of-two
- */
-template <int N>
-struct PowerOfTwo
-{
-    enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-
-/******************************************************************************
- * Pointer vs. iterator detection
- ******************************************************************************/
-
-/**
- * \brief Pointer vs. iterator
- */
-template <typename Tp>
-struct IsPointer
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsPointer<Tp*>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Qualifier detection
- ******************************************************************************/
-
-/**
- * \brief Volatile modifier test
- */
-template <typename Tp>
-struct IsVolatile
-{
-    enum { VALUE = 0 };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp>
-struct IsVolatile<Tp volatile>
-{
-    enum { VALUE = 1 };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Qualifier removal
- ******************************************************************************/
-
-/**
- * \brief Removes \p const and \p volatile qualifiers from type \p Tp.
- *
- * For example:
- *     <tt>typename RemoveQualifiers<volatile int>::Type         // int;</tt>
- */
-template <typename Tp, typename Up = Tp>
-struct RemoveQualifiers
-{
-    /// Type without \p const and \p volatile qualifiers
-    typedef Up Type;
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, volatile Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const Up>
-{
-    typedef Up Type;
-};
-
-template <typename Tp, typename Up>
-struct RemoveQualifiers<Tp, const volatile Up>
-{
-    typedef Up Type;
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-
-/**
- * \brief Defines a structure \p detector_name that is templated on type \p T.  The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name
- */
-#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name)  \
-    template <typename T>                                               \
-    struct detector_name                                                \
-    {                                                                   \
-        template <typename C>                                           \
-        static char& test(typename C::nested_type_name*);               \
-        template <typename>                                             \
-        static int& test(...);                                          \
-        enum                                                            \
-        {                                                               \
-            VALUE = sizeof(test<T>(0)) < sizeof(int)                    \
-        };                                                              \
-    };
-
-
-
-/******************************************************************************
- * Simple enable-if (similar to Boost)
- ******************************************************************************/
-
-/**
- * \brief Simple enable-if (similar to Boost)
- */
-template <bool Condition, class T = void>
-struct EnableIf
-{
-    /// Enable-if type for SFINAE dummy variables
-    typedef T Type;
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <class T>
-struct EnableIf<false, T> {};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/******************************************************************************
- * Typedef-detection
- ******************************************************************************/
-
-/**
- * \brief Determine whether or not BinaryOp's functor is of the form <tt>bool operator()(const T& a, const T&b)</tt> or <tt>bool operator()(const T& a, const T&b, unsigned int idx)</tt>
- */
-template <typename T, typename BinaryOp>
-struct BinaryOpHasIdxParam
-{
-private:
-/*
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx) const>  struct SFINAE1 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, unsigned int idx)>        struct SFINAE2 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx) const>                struct SFINAE3 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, unsigned int idx)>                      struct SFINAE4 {};
-*/
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx) const>           struct SFINAE5 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(const T &a, const T &b, int idx)>                 struct SFINAE6 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx) const>                         struct SFINAE7 {};
-    template <typename BinaryOpT, bool (BinaryOpT::*)(T a, T b, int idx)>                               struct SFINAE8 {};
-/*
-    template <typename BinaryOpT> static char Test(SFINAE1<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE2<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE3<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE4<BinaryOpT, &BinaryOpT::operator()> *);
-*/
-    template <typename BinaryOpT> static char Test(SFINAE5<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE6<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE7<BinaryOpT, &BinaryOpT::operator()> *);
-    template <typename BinaryOpT> static char Test(SFINAE8<BinaryOpT, &BinaryOpT::operator()> *);
-
-    template <typename BinaryOpT> static int Test(...);
-
-public:
-
-    /// Whether the functor BinaryOp has a third <tt>unsigned int</tt> index param
-    static const bool HAS_PARAM = sizeof(Test<BinaryOp>(NULL)) == sizeof(char);
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-/******************************************************************************
- * Simple type traits utilities.
- *
- * For example:
- *     Traits<int>::CATEGORY             // SIGNED_INTEGER
- *     Traits<NullType>::NULL_TYPE       // true
- *     Traits<uint4>::CATEGORY           // NOT_A_NUMBER
- *     Traits<uint4>::PRIMITIVE;         // false
- *
- ******************************************************************************/
-
-/**
- * \brief Basic type traits categories
- */
-enum Category
-{
-    NOT_A_NUMBER,
-    SIGNED_INTEGER,
-    UNSIGNED_INTEGER,
-    FLOATING_POINT
-};
-
-
-/**
- * \brief Basic type traits
- */
-template <Category _CATEGORY, bool _PRIMITIVE, bool _NULL_TYPE, typename _UnsignedBits>
-struct BaseTraits
-{
-    /// Category
-    static const Category CATEGORY      = _CATEGORY;
-    enum
-    {
-        PRIMITIVE       = _PRIMITIVE,
-        NULL_TYPE       = _NULL_TYPE,
-    };
-};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-/**
- * Basic type traits (unsigned primitive specialization)
- */
-template <typename _UnsignedBits>
-struct BaseTraits<UNSIGNED_INTEGER, true, false, _UnsignedBits>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = UNSIGNED_INTEGER;
-    static const UnsignedBits   MIN_KEY     = UnsignedBits(0);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1);
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key;
-    }
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key;
-    }
-};
-
-
-/**
- * Basic type traits (signed primitive specialization)
- */
-template <typename _UnsignedBits>
-struct BaseTraits<SIGNED_INTEGER, true, false, _UnsignedBits>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = SIGNED_INTEGER;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   MIN_KEY     = HIGH_BIT;
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        return key ^ HIGH_BIT;
-    };
-
-};
-
-
-/**
- * Basic type traits (fp primitive specialization)
- */
-template <typename _UnsignedBits>
-struct BaseTraits<FLOATING_POINT, true, false, _UnsignedBits>
-{
-    typedef _UnsignedBits       UnsignedBits;
-
-    static const Category       CATEGORY    = FLOATING_POINT;
-    static const UnsignedBits   HIGH_BIT    = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1);
-    static const UnsignedBits   MIN_KEY     = UnsignedBits(-1);
-    static const UnsignedBits   MAX_KEY     = UnsignedBits(-1) ^ HIGH_BIT;
-
-    static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT;
-        return key ^ mask;
-    };
-
-    static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key)
-    {
-        UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1);
-        return key ^ mask;
-    };
-
-    enum
-    {
-        PRIMITIVE       = true,
-        NULL_TYPE       = false,
-    };
-};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Numeric type traits
- */
-template <typename T> struct NumericTraits :            BaseTraits<NOT_A_NUMBER, false, false, T> {};
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-template <> struct NumericTraits<NullType> :            BaseTraits<NOT_A_NUMBER, false, true, NullType> {};
-
-template <> struct NumericTraits<char> :                BaseTraits<(std::numeric_limits<char>::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char> {};
-template <> struct NumericTraits<signed char> :         BaseTraits<SIGNED_INTEGER, true, false, unsigned char> {};
-template <> struct NumericTraits<short> :               BaseTraits<SIGNED_INTEGER, true, false, unsigned short> {};
-template <> struct NumericTraits<int> :                 BaseTraits<SIGNED_INTEGER, true, false, unsigned int> {};
-template <> struct NumericTraits<long> :                BaseTraits<SIGNED_INTEGER, true, false, unsigned long> {};
-template <> struct NumericTraits<long long> :           BaseTraits<SIGNED_INTEGER, true, false, unsigned long long> {};
-
-template <> struct NumericTraits<unsigned char> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned char> {};
-template <> struct NumericTraits<unsigned short> :      BaseTraits<UNSIGNED_INTEGER, true, false, unsigned short> {};
-template <> struct NumericTraits<unsigned int> :        BaseTraits<UNSIGNED_INTEGER, true, false, unsigned int> {};
-template <> struct NumericTraits<unsigned long> :       BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long> {};
-template <> struct NumericTraits<unsigned long long> :  BaseTraits<UNSIGNED_INTEGER, true, false, unsigned long long> {};
-
-template <> struct NumericTraits<float> :               BaseTraits<FLOATING_POINT, true, false, unsigned int> {};
-template <> struct NumericTraits<double> :              BaseTraits<FLOATING_POINT, true, false, unsigned long long> {};
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-/**
- * \brief Type traits
- */
-template <typename T>
-struct Traits : NumericTraits<typename RemoveQualifiers<T>::Type> {};
-
-
-
-/** @} */       // end group UtilModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
deleted file mode 100644
index 235923181..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh
+++ /dev/null
@@ -1,420 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_type.cuh"
-#include "../../util_macro.cuh"
-#include "../../util_namespace.cuh"
-#include "../../util_debug.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceShfl
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp reduction steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// Number of logical warps in a PTX warp
-        LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS,
-    };
-
-    template <typename S>
-    struct IsInteger
-    {
-        enum {
-            /// Whether the data type is a primitive integer
-            IS_INTEGER = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) || (Traits<S>::CATEGORY == SIGNED_INTEGER),
-
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_INTEGER = IS_INTEGER && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP, int WARPS>
-    struct LastLaneMask
-    {
-        enum {
-            BASE_MASK   = 1 << (LOGICAL_WARP_THREADS - 1),
-            MASK        = (LastLaneMask<WARP + 1, WARPS>::MASK << LOGICAL_WARP_THREADS) | BASE_MASK,
-        };
-    };
-
-    // Creates a mask where the last thread in each logical warp is set
-    template <int WARP>
-    struct LastLaneMask<WARP, WARP>
-    {
-        enum {
-            MASK        = 1 << (LOGICAL_WARP_THREADS - 1),
-        };
-    };
-
-
-
-    /// Shared memory storage layout type
-    typedef NullType TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    int lane_id;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceShfl(
-        TempStorage &temp_storage)
-    :
-        lane_id(LaneId())
-    {}
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Reduction (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int ReduceStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(last_lane), "r"(input));
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across fp32 types)
-    __device__ __forceinline__ float ReduceStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.down.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(last_lane), "f"(input));
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long ReduceStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across long long types)
-    __device__ __forceinline__ long long ReduceStep(
-        long long           input,              ///< [in] Calling thread's input item.
-        cub::Sum            reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(last_lane));
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for summation across double types)
-    __device__ __forceinline__ double ReduceStep(
-        double              input,              ///< [in] Calling thread's input item.
-        cub::Sum            reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.down.b32 lo|p, lo, %2, %3;"
-            "  shfl.down.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.f64 %0, %0, %1;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(last_lane));
-
-        return output;
-    }
-
-
-    /// Reduction (specialized for ReduceBySegmentOp<cub::Sum> across ItemOffsetPair<Value, Offset> types)
-    template <typename Value, typename Offset>
-    __device__ __forceinline__ ItemOffsetPair<Value, Offset> ReduceStep(
-        ItemOffsetPair<Value, Offset>                                   input,              ///< [in] Calling thread's input item.
-        ReduceBySegmentOp<cub::Sum, ItemOffsetPair<Value, Offset> >     reduction_op,       ///< [in] Binary reduction operator
-        int                                                             last_lane,          ///< [in] Index of last lane in segment
-        int                                                             offset)             ///< [in] Up-offset to pull from
-    {
-        ItemOffsetPair<Value, Offset> output;
-
-        output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type<IsInteger<Value>::IS_SMALL_INTEGER>());
-        output.offset = ReduceStep(input.offset, cub::Sum(), last_lane, offset, Int2Type<IsInteger<Offset>::IS_SMALL_INTEGER>());
-
-//        int last_value_lane = (input.offset > 0) ? 0 : last_lane;
-//        output.value = ReduceStep(input.value, cub::Sum(), last_value_lane, offset, Int2Type<IsInteger<Value>::IS_SMALL_INTEGER>());
-
-        if (input.offset > 0)
-            output.value = input.value;
-
-        return output;
-    }
-
-
-    /// Reduction step (generic)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T                  input,              ///< [in] Calling thread's input item.
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 last_lane,          ///< [in] Index of last lane in segment
-        int                 offset)             ///< [in] Up-offset to pull from
-    {
-        T output = input;
-
-        T temp = ShuffleDown(output, offset);
-
-        // Perform reduction op if valid
-        if (offset <= last_lane - lane_id)
-            output = reduction_op(temp, output);
-
-        return output;
-    }
-
-
-    /// Reduction step (specialized for small integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<true>  is_small_integer)   ///< [in] Marker type indicating whether T is a small integer
-    {
-        unsigned int temp = reinterpret_cast<unsigned int &>(input);
-
-        temp = ReduceStep(temp, reduction_op, last_lane, offset);
-
-        return reinterpret_cast<_T&>(temp);
-    }
-
-    /// Reduction step (specialized for types other than small integers size 32b or less)
-    template <typename _T, typename ReductionOp>
-    __device__ __forceinline__ _T ReduceStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ReductionOp     reduction_op,       ///< [in] Binary reduction operator
-        int             last_lane,          ///< [in] Index of last lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<false> is_small_integer)   ///< [in] Marker type indicating whether T is a small integer
-    {
-        return ReduceStep(input, reduction_op, last_lane, offset);
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /// Reduction
-    template <
-        bool            ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int             FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename        ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T               input,                  ///< [in] Calling thread's input
-        int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp     reduction_op)           ///< [in] Binary reduction operator
-    {
-        // Get the last thread in the logical warp
-        int first_warp_thread   = 0;
-        int last_warp_thread    = LOGICAL_WARP_THREADS - 1;
-        if (!IS_ARCH_WARP)
-        {
-            first_warp_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
-            last_warp_thread |= lane_id;
-        }
-
-        // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
-        int lanes_with_valid_data = (folded_items_per_warp - 1) / FOLDED_ITEMS_PER_LANE;
-
-        // Get the last valid lane
-        int last_lane = (ALL_LANES_VALID) ?
-            last_warp_thread :
-            CUB_MIN(last_warp_thread, first_warp_thread + lanes_with_valid_data);
-
-        T output = input;
-
-        // Iterate reduction steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_INTEGER>());
-        }
-
-        return output;
-    }
-
-
-    /// Segmented reduction
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        Flag,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Binary reduction operator
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = __ballot(flag);
-
-        if (HEAD_SEGMENTED)
-            warp_flags >>= 1;
-
-        // Mask in the last lanes of each logical warp
-        warp_flags |= LastLaneMask<1, LOGICAL_WARPS>::MASK;
-
-        // Mask out the bits below the current thread
-        warp_flags &= LaneMaskGe();
-
-        // Find the next set flag
-        int last_lane = __clz(__brev(warp_flags));
-
-        T output = input;
-
-        // Iterate reduction steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_INTEGER>());
-        }
-
-        return output;
-    }
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
deleted file mode 100644
index 55acc77ce..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh
+++ /dev/null
@@ -1,357 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being reduced
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpReduceSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-
-        /// Flag status (when not using ballot)
-        UNSET   = 0x0,  // Is initially unset
-        SET     = 0x1,  // Is initially set
-        SEEN    = 0x2,  // Has seen another head flag from a successor peer
-    };
-
-    /// Shared memory flag type
-    typedef unsigned char SmemFlag;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    struct _TempStorage
-    {
-        T           reduce[WARP_SMEM_ELEMENTS];
-        SmemFlag    flags[WARP_SMEM_ELEMENTS];
-    };
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    int             lane_id;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpReduceSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    //---------------------------------------------------------------------
-    // Regular reduction
-    //---------------------------------------------------------------------
-
-    /**
-     * Reduction step
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp,
-        int                 STEP>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op,           ///< [in] Reduction operator
-        Int2Type<STEP>      step)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share input through buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-        // Update input if peer_addend is in range
-        if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) * FOLDED_ITEMS_PER_LANE < folded_items_per_warp))
-        {
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-            input = reduction_op(input, peer_addend);
-        }
-
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<STEP + 1>());
-    }
-
-
-    /**
-     * Reduction step (terminate)
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T ReduceStep(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op,           ///< [in] Reduction operator
-        Int2Type<STEPS>     step)
-    {
-        return input;
-    }
-
-
-    //---------------------------------------------------------------------
-    // Segmented reduction
-    //---------------------------------------------------------------------
-
-
-    /**
-     * Ballot-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        Flag,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,       ///< [in] Reduction operator
-        Int2Type<true>  has_ballot)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        // Get the start flags for each thread in the warp.
-        int warp_flags = __ballot(flag);
-
-        if (!HEAD_SEGMENTED)
-            warp_flags <<= 1;
-
-        // Keep bits above the current thread.
-        warp_flags &= LaneMaskGt();
-
-        // Accommodate packing of multiple logical warps in a single physical warp
-        if (!IS_ARCH_WARP)
-        {
-            warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS;
-        }
-
-        // Find next flag
-        int next_flag = __clz(__brev(warp_flags));
-
-        // Clip the next segment at the warp boundary if necessary
-        if (LOGICAL_WARP_THREADS != 32)
-            next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS);
-
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input into buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            // Update input if peer_addend is in range
-            if (OFFSET < next_flag - lane_id)
-            {
-                T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-                input = reduction_op(input, peer_addend);
-            }
-        }
-
-        return input;
-    }
-
-
-    /**
-     * Smem-based segmented reduce
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        Flag,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op,       ///< [in] Reduction operator
-        Int2Type<false> has_ballot)         ///< [in] Marker type for whether the target arch has ballot functionality
-    {
-        enum
-        {
-            UNSET   = 0x0,  // Is initially unset
-            SET     = 0x1,  // Is initially set
-            SEEN    = 0x2,  // Has seen another head flag from a successor peer
-        };
-
-        // Alias flags onto shared data storage
-        volatile SmemFlag *flag_storage = temp_storage.flags;
-
-        SmemFlag flag_status = (flag) ? SET : UNSET;
-
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            const int OFFSET = 1 << STEP;
-
-            // Share input through buffer
-            ThreadStore<STORE_VOLATILE>(&temp_storage.reduce[lane_id], input);
-
-            // Get peer from buffer
-            T peer_addend = ThreadLoad<LOAD_VOLATILE>(&temp_storage.reduce[lane_id + OFFSET]);
-
-            // Share flag through buffer
-            flag_storage[lane_id] = flag_status;
-
-            // Get peer flag from buffer
-            SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET];
-
-            // Update input if peer was in range
-            if (lane_id < LOGICAL_WARP_THREADS - OFFSET)
-            {
-                if (HEAD_SEGMENTED)
-                {
-                    // Head-segmented
-                    if ((flag_status & SEEN) == 0)
-                    {
-                        // Has not seen a more distant head flag
-                        if (peer_flag_status & SET)
-                        {
-                            // Has now seen a head flag
-                            flag_status |= SEEN;
-                        }
-                        else
-                        {
-                            // Peer is not a head flag: grab its count
-                            input = reduction_op(input, peer_addend);
-                        }
-
-                        // Update seen status to include that of peer
-                        flag_status |= (peer_flag_status & SEEN);
-                    }
-                }
-                else
-                {
-                    // Tail-segmented.  Simply propagate flag status
-                    if (!flag_status)
-                    {
-                        input = reduction_op(input, peer_addend);
-                        flag_status |= peer_flag_status;
-                    }
-
-                }
-            }
-        }
-
-        return input;
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /**
-     * Reduction
-     */
-    template <
-        bool                ALL_LANES_VALID,        ///< Whether all lanes in each warp are contributing a valid fold of items
-        int                 FOLDED_ITEMS_PER_LANE,  ///< Number of items folded into each lane
-        typename            ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,                  ///< [in] Calling thread's input
-        int                 folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
-        ReductionOp         reduction_op)           ///< [in] Reduction operator
-    {
-        return ReduceStep<ALL_LANES_VALID, FOLDED_ITEMS_PER_LANE>(input, folded_items_per_warp, reduction_op, Int2Type<0>());
-    }
-
-
-    /**
-     * Segmented reduction
-     */
-    template <
-        bool            HEAD_SEGMENTED,     ///< Whether flags indicate a segment-head or a segment-tail
-        typename        Flag,
-        typename        ReductionOp>
-    __device__ __forceinline__ T SegmentedReduce(
-        T               input,              ///< [in] Calling thread's input
-        Flag            flag,               ///< [in] Whether or not the current lane is a segment head/tail
-        ReductionOp     reduction_op)       ///< [in] Reduction operator
-    {
-        return SegmentedReduce<HEAD_SEGMENTED>(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>());
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
deleted file mode 100644
index 702373c39..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh
+++ /dev/null
@@ -1,516 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../util_type.cuh"
-#include "../../util_ptx.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanShfl
-{
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up
-        SHFL_C = ((-1 << STEPS) & 31) << 8,
-    };
-
-    template <typename S>
-    struct IsInteger
-    {
-        enum {
-            /// Whether the data type is a primitive integer
-            IS_INTEGER = (Traits<S>::CATEGORY == UNSIGNED_INTEGER) || (Traits<S>::CATEGORY == SIGNED_INTEGER),
-
-            ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange
-            IS_SMALL_INTEGER = IS_INTEGER && (sizeof(S) <= sizeof(unsigned int))
-        };
-    };
-
-    /// Shared memory storage layout type
-    typedef NullType TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    int lane_id;
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanShfl(
-        TempStorage &temp_storage)
-    :
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Inclusive prefix scan step (specialized for summation across uint32 types)
-    __device__ __forceinline__ unsigned int InclusiveScanStep(
-        unsigned int    input,              ///< [in] Calling thread's input item.
-        cub::Sum        scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned int output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.u32 r0, r0, %4;"
-            "  mov.u32 %0, r0;"
-            "}"
-            : "=r"(output) : "r"(input), "r"(offset), "r"(first_lane), "r"(input));
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp32 types)
-    __device__ __forceinline__ float InclusiveScanStep(
-        float           input,              ///< [in] Calling thread's input item.
-        cub::Sum        scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        float output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .f32 r0;"
-            "  .reg .pred p;"
-            "  shfl.up.b32 r0|p, %1, %2, %3;"
-            "  @p add.f32 r0, r0, %4;"
-            "  mov.f32 %0, r0;"
-            "}"
-            : "=f"(output) : "f"(input), "r"(offset), "r"(first_lane), "f"(input));
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across unsigned long long types)
-    __device__ __forceinline__ unsigned long long InclusiveScanStep(
-        unsigned long long  input,              ///< [in] Calling thread's input item.
-        cub::Sum            scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        unsigned long long output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.u64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(first_lane));
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across long long types)
-    __device__ __forceinline__ long long InclusiveScanStep(
-        long long       input,              ///< [in] Calling thread's input item.
-        cub::Sum        scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        long long output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.s64 %0, %0, %1;"
-            "}"
-            : "=l"(output) : "l"(input), "r"(offset), "r"(first_lane));
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for summation across fp64 types)
-    __device__ __forceinline__ double InclusiveScanStep(
-        double          input,              ///< [in] Calling thread's input item.
-        cub::Sum        scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        double output;
-
-        // Use predicate set from SHFL to guard against invalid peers
-        asm(
-            "{"
-            "  .reg .u32 lo;"
-            "  .reg .u32 hi;"
-            "  .reg .pred p;"
-            "  mov.b64 {lo, hi}, %1;"
-            "  shfl.up.b32 lo|p, lo, %2, %3;"
-            "  shfl.up.b32 hi|p, hi, %2, %3;"
-            "  mov.b64 %0, {lo, hi};"
-            "  @p add.f64 %0, %0, %1;"
-            "}"
-            : "=d"(output) : "d"(input), "r"(offset), "r"(first_lane));
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan (specialized for ReduceBySegmentOp<cub::Sum> across ItemOffsetPair<Value, Offset> types)
-    template <typename Value, typename Offset>
-    __device__ __forceinline__ ItemOffsetPair<Value, Offset> InclusiveScanStep(
-        ItemOffsetPair<Value, Offset>                               input,              ///< [in] Calling thread's input item.
-        ReduceBySegmentOp<cub::Sum, ItemOffsetPair<Value, Offset> > scan_op,            ///< [in] Binary scan operator
-        int                                                         first_lane,         ///< [in] Index of first lane in segment
-        int                                                         offset)             ///< [in] Up-offset to pull from
-    {
-        ItemOffsetPair<Value, Offset> output;
-
-        output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type<IsInteger<Value>::IS_SMALL_INTEGER>());
-        output.offset = InclusiveScanStep(input.offset, cub::Sum(), first_lane, offset, Int2Type<IsInteger<Offset>::IS_SMALL_INTEGER>());
-
-        if (input.offset > 0)
-            output.value = input.value;
-
-/*
-        int first_value_lane = (input.offset > 0) ? LOGICAL_WARP_THREADS - 1 : first_lane;
-        output.value = InclusiveScanStep(input.value, cub::Sum(), first_value_lane, offset, Int2Type<IsInteger<Value>::IS_SMALL_INTEGER>());
-*/
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (generic)
-    template <typename _T, typename ScanOp>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset)             ///< [in] Up-offset to pull from
-    {
-        T output = input;
-
-        T temp = ShuffleUp(output, offset);
-
-        // Perform scan op if from a valid peer
-        if (lane_id >= offset)
-            output = scan_op(temp, output);
-
-        return output;
-    }
-
-
-    /// Inclusive prefix scan step (specialized for small integers size 32b or less)
-    template <typename _T, typename ScanOp>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<true>  is_small_integer)   ///< [in] Marker type indicating whether T is a small integer
-    {
-        unsigned int temp = reinterpret_cast<unsigned int &>(input);
-
-        temp = InclusiveScanStep(temp, scan_op, first_lane, offset);
-
-        return reinterpret_cast<_T&>(temp);
-    }
-
-
-    /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less)
-    template <typename _T, typename ScanOp>
-    __device__ __forceinline__ _T InclusiveScanStep(
-        _T              input,              ///< [in] Calling thread's input item.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        int             first_lane,         ///< [in] Index of first lane in segment
-        int             offset,             ///< [in] Up-offset to pull from
-        Int2Type<false> is_small_integer)   ///< [in] Marker type indicating whether T is a small integer
-    {
-        return InclusiveScanStep(input, scan_op, first_lane, offset);
-    }
-
-
-    /// Get exclusive from inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ T GetExclusive(
-        T               input,
-        T               inclusive,
-        cub::Sum        scan_op,
-        Int2Type<true>  is_integer)
-    {
-        return inclusive - input;
-    }
-
-
-    /// Get exclusive from inclusive (specialized for scans other than summation of integer types)
-    template <typename ScanOp, int _IS_INTEGER>
-    __device__ __forceinline__ T GetExclusive(
-        T                       input,
-        T                       inclusive,
-        ScanOp                  scan_op,
-        Int2Type<_IS_INTEGER>   is_integer)
-    {
-        return ShuffleUp(inclusive, 1);
-    }
-
-    /// Get exclusive from inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ T GetExclusive(
-        T               input,
-        T               inclusive,
-        T               identity,
-        cub::Sum        scan_op,
-        Int2Type<true>  is_integer)
-    {
-        return inclusive - input;
-    }
-
-
-    /// Get exclusive from inclusive (specialized for scans other than summation of integer types)
-    template <typename ScanOp, int _IS_INTEGER>
-    __device__ __forceinline__ T GetExclusive(
-        T                       input,
-        T                       inclusive,
-        T                       identity,
-        ScanOp                  scan_op,
-        Int2Type<_IS_INTEGER>   is_integer)
-    {
-        T exclusive = ShuffleUp(inclusive, 1);
-        return (lane_id == 0) ? identity : exclusive;
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        int             src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return ShuffleBroadcast(input, src_lane, LOGICAL_WARP_THREADS);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        output = input;
-
-        // Iterate scan steps
-        #pragma unroll
-        for (int STEP = 0; STEP < STEPS; STEP++)
-        {
-            output = InclusiveScanStep(output, scan_op, SHFL_C, 1 << STEP, Int2Type<IsInteger<T>::IS_SMALL_INTEGER>());
-        }
-    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, output, scan_op);
-
-        // Grab aggregate from last warp lane
-        warp_aggregate = Broadcast(output, LOGICAL_WARP_THREADS - 1);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Combo (inclusive & exclusive) operations
-    //---------------------------------------------------------------------
-
-    /// Combination scan without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        // Compute inclusive scan
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab result from predecessor
-        exclusive_output = GetExclusive(input, inclusive_output, scan_op, Int2Type<IsInteger<T>::IS_INTEGER>());
-    }
-
-    /// Combination scan with identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        // Compute inclusive scan
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab result from predecessor
-        exclusive_output = GetExclusive(input, inclusive_output, identity, scan_op, Int2Type<IsInteger<T>::IS_INTEGER>());
-    }
-
-
-    //---------------------------------------------------------------------
-    // Exclusive operations
-    //---------------------------------------------------------------------
-
-    /// Exclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, identity, scan_op);
-    }
-
-
-    /// Exclusive scan with aggregate, without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, scan_op);
-    }
-
-
-    /// Exclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, identity, scan_op);
-
-        // Grab aggregate from last warp lane
-        warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1);
-    }
-
-
-    /// Exclusive scan with aggregate, without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, scan_op);
-
-        // Grab aggregate from last warp lane
-        warp_aggregate = Broadcast(inclusive_output, LOGICAL_WARP_THREADS - 1);
-    }
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh b/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
deleted file mode 100644
index 334d08bc5..000000000
--- a/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh
+++ /dev/null
@@ -1,402 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "../../thread/thread_operators.cuh"
-#include "../../thread/thread_load.cuh"
-#include "../../thread/thread_store.cuh"
-#include "../../util_type.cuh"
-#include "../../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-template <
-    typename    T,                      ///< Data type being scanned
-    int         LOGICAL_WARP_THREADS,   ///< Number of threads per logical warp
-    int         PTX_ARCH>               ///< The PTX compute capability for which to to specialize this collective
-struct WarpScanSmem
-{
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// The number of warp scan steps
-        STEPS = Log2<LOGICAL_WARP_THREADS>::VALUE,
-
-        /// The number of threads in half a warp
-        HALF_WARP_THREADS = 1 << (STEPS - 1),
-
-        /// The number of shared memory elements per warp
-        WARP_SMEM_ELEMENTS =  LOGICAL_WARP_THREADS + HALF_WARP_THREADS,
-
-        /// Whether the data type is a primitive integer
-        IS_INTEGER = (Traits<T>::CATEGORY == UNSIGNED_INTEGER) || (Traits<T>::CATEGORY == SIGNED_INTEGER),
-
-    };
-
-    /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars)
-    typedef typename If<((Equals<T, char>::VALUE || Equals<T, signed char>::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT;
-
-    /// Shared memory storage layout type (1.5 warps-worth of elements for each warp)
-    typedef CellT _TempStorage[WARP_SMEM_ELEMENTS];
-
-    // Alias wrapper allowing storage to be unioned
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    _TempStorage    &temp_storage;
-    unsigned int    lane_id;
-
-
-    /******************************************************************************
-     * Construction
-     ******************************************************************************/
-
-    /// Constructor
-    __device__ __forceinline__ WarpScanSmem(
-        TempStorage     &temp_storage)
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-    /// Basic inclusive scan iteration (template unrolled, inductive-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        int         STEP,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T               &partial,
-        ScanOp          scan_op,
-        Int2Type<STEP>  step)
-    {
-        const int OFFSET = 1 << STEP;
-
-        // Share partial into buffer
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial);
-
-        // Update partial if addend is in range
-        if (HAS_IDENTITY || (lane_id >= OFFSET))
-        {
-            T addend = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]);
-            partial = scan_op(addend, partial);
-        }
-
-        ScanStep<HAS_IDENTITY>(partial, scan_op, Int2Type<STEP + 1>());
-    }
-
-
-    /// Basic inclusive scan iteration(template unrolled, base-case specialization)
-    template <
-        bool        HAS_IDENTITY,
-        typename    ScanOp>
-    __device__ __forceinline__ void ScanStep(
-        T               &partial,
-        ScanOp          scan_op,
-        Int2Type<STEPS>  step)
-    {}
-
-
-    /// Inclusive prefix scan with identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        ThreadStore<STORE_VOLATILE>(&temp_storage[lane_id], (CellT) identity);
-
-        // Iterate scan steps
-        output = input;
-        ScanStep<true>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /// Inclusive prefix scan (specialized for summation across primitive types)
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        Sum             scan_op,            ///< [in] Binary scan operator
-        Int2Type<true>  is_primitive)       ///< [in] Marker type indicating whether T is primitive type
-    {
-        T identity = ZeroInitialize<T>();
-        InclusiveScan(input, output, identity, scan_op);
-    }
-
-
-    /// Inclusive prefix scan
-    template <typename ScanOp, int IS_PRIMITIVE>
-    __device__ __forceinline__ void InclusiveScan(
-        T                       input,              ///< [in] Calling thread's input item.
-        T                       &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp                  scan_op,            ///< [in] Binary scan operator
-        Int2Type<IS_PRIMITIVE>  is_primitive)       ///< [in] Marker type indicating whether T is primitive type
-    {
-        // Iterate scan steps
-        output = input;
-        ScanStep<false>(output, scan_op, Int2Type<0>());
-    }
-
-
-    /// Get exclusive from inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ T GetExclusive(
-        T               input,
-        T               inclusive,
-        Sum             scan_op,
-        Int2Type<true>  is_integer)
-    {
-        return inclusive - input;
-    }
-
-
-    /// Get exclusive from inclusive (specialized for scans other than summation of integer types)
-    template <typename ScanOp, int _IS_INTEGER>
-    __device__ __forceinline__ T GetExclusive(
-        T                       input,
-        T                       inclusive,
-        ScanOp                  scan_op,
-        Int2Type<_IS_INTEGER>   is_integer)
-    {
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-        return (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-    }
-
-
-    /// Get exclusive from inclusive (specialized for summation of integer types)
-    __device__ __forceinline__ T GetExclusive(
-        T               input,
-        T               inclusive,
-        Sum             scan_op,
-        T               &warp_aggregate,
-        Int2Type<true>  is_integer)
-    {
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        return inclusive - input;
-    }
-
-
-    /// Get exclusive from inclusive (specialized for scans other than summation of integer types)
-    template <typename ScanOp, int _IS_INTEGER>
-    __device__ __forceinline__ T GetExclusive(
-        T                       input,
-        T                       inclusive,
-        ScanOp                  scan_op,
-        T                       &warp_aggregate,
-        Int2Type<_IS_INTEGER>   is_integer)
-    {
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive);
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-
-        return (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id - 1]);
-    }
-
-
-    /******************************************************************************
-     * Interface
-     ******************************************************************************/
-
-    /// Broadcast
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        if (lane_id == src_lane)
-        {
-            ThreadStore<STORE_VOLATILE>(temp_storage, (CellT) input);
-        }
-
-        return (T) ThreadLoad<LOAD_VOLATILE>(temp_storage);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Inclusive operations
-    //---------------------------------------------------------------------
-
-    /// Inclusive scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InclusiveScan(input, output, scan_op, Int2Type<Traits<T>::PRIMITIVE>());    }
-
-
-    /// Inclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InclusiveScan(input, output, scan_op);
-
-        // Retrieve aggregate
-        ThreadStore<STORE_VOLATILE>(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) output);
-        warp_aggregate = (T) ThreadLoad<LOAD_VOLATILE>(&temp_storage[WARP_SMEM_ELEMENTS - 1]);
-    }
-
-
-    //---------------------------------------------------------------------
-    // Combo (inclusive & exclusive) operations
-    //---------------------------------------------------------------------
-
-    /// Combination scan without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        // Compute inclusive scan
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab result from predecessor
-        exclusive_output = GetExclusive(input, inclusive_output, scan_op, Int2Type<IS_INTEGER>());
-    }
-
-    /// Combination scan with identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        // Compute inclusive scan
-        InclusiveScan(input, inclusive_output, identity, scan_op);
-
-        // Grab result from predecessor
-        exclusive_output = GetExclusive(input, inclusive_output, scan_op, Int2Type<IS_INTEGER>());
-    }
-
-
-    //---------------------------------------------------------------------
-    // Exclusive operations
-    //---------------------------------------------------------------------
-
-    /// Exclusive scan
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, identity, scan_op);
-    }
-
-
-    /// Exclusive scan without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        T inclusive_output;
-        Scan(input, inclusive_output, output, scan_op);
-    }
-
-    /// Exclusive scan with aggregate
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        // Compute inclusive scan
-        T inclusive_output;
-        InclusiveScan(input, inclusive_output, identity, scan_op);
-
-        // Grab result from predecessor
-        output = GetExclusive(input, inclusive_output, scan_op, warp_aggregate, Int2Type<IS_INTEGER>());
-    }
-
-
-    /// Exclusive scan with aggregate, without identity
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        // Compute inclusive scan
-        T inclusive_output;
-        InclusiveScan(input, inclusive_output, scan_op);
-
-        // Grab result from predecessor
-        output = GetExclusive(input, inclusive_output, scan_op, warp_aggregate, Int2Type<IS_INTEGER>());
-    }
-
-
-};
-
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh b/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
deleted file mode 100644
index 7c951ed47..000000000
--- a/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh
+++ /dev/null
@@ -1,612 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_reduce_shfl.cuh"
-#include "specializations/warp_reduce_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpReduce class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png)
- *
- * \tparam T                        The reduction input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - A <a href="http://en.wikipedia.org/wiki/Reduce_(higher-order_function)"><em>reduction</em></a> (or <em>fold</em>)
- *   uses a binary combining operator to compute a single aggregate from a list of input elements.
- * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL instructions)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic reduction)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpReduce}
- * \par
- * The code snippet below illustrates four concurrent warp sum reductions within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for 4 warps
- *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96)
- *     int warp_id = threadIdx.x / 32;
- *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
- * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
- * \p 2544, and \p 3568, respectively (and is undefined in other threads).
- *
- * \par
- * The code snippet below illustrates a single warp sum reduction within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpReduce for type int
- *     typedef cub::WarpReduce<int> WarpReduce;
- *
- *     // Allocate WarpReduce shared memory for one warp
- *     __shared__ typename WarpReduce::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a reduction
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Return the warp-wide sum to lane0
- *         int aggregate = WarpReduce(temp_storage).Sum(thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{0, 1, 2, 3, ..., 31}</tt>.
- * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads).
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpReduce
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE,
-    };
-
-public:
-
-    #ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-    /// Internal specialization.  Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpReduceShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpReduceSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpReduce;
-
-    #endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-private:
-
-    /// Shared memory storage layout type for WarpReduce
-    typedef typename InternalWarpReduce::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage &temp_storage;
-
-
-    /******************************************************************************
-     * Utility methods
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpReduce}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpReduce(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias())
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Summation reductions
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes a warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp sum reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520,
-     * \p 2544, and \p 3568, respectively (and is undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input)              ///< [in] Calling thread's input
-    {
-        return InternalWarpReduce(temp_storage).Reduce<true, 1>(input, LOGICAL_WARP_THREADS, cub::Sum());
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide sum in the calling warp.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a sum reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Sum(
-     *         thread_data, valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 6 (and is
-     * undefined in other threads).
-     *
-     */
-    __device__ __forceinline__ T Sum(
-        T                   input,              ///< [in] Calling thread's input
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {
-        // Determine if we don't need bounds checking
-        return InternalWarpReduce(temp_storage).Reduce<false, 1>(input, valid_items, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedSum(
-     *         thread_data, head_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     *
-     */
-    template <
-        typename            Flag>
-    __device__ __forceinline__ T HeadSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        Flag                head_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-    {
-        return HeadSegmentedReduce(input, head_flag, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags.  The sum of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp sum
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return the warp-wide sums to each lane0
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedSum(
-     *         thread_data, tail_flag);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 6, \p 22, \p 38, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            Flag>
-    __device__ __forceinline__ T TailSegmentedSum(
-        T                   input,              ///< [in] Calling thread's input
-        Flag                tail_flag)          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-    {
-        return TailSegmentedReduce(input, tail_flag, cub::Sum());
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Generic reductions
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp max reductions within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for 4 warps
-     *     __shared__ typename WarpReduce::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int warp_id = threadIdx.x / 32;
-     *     int aggregate = WarpReduce(temp_storage[warp_id]).Reduce(
-     *         thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63,
-     * \p 95, and \p 127, respectively  (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op)       ///< [in] Binary reduction operator
-    {
-        return InternalWarpReduce(temp_storage).Reduce<true, 1>(input, LOGICAL_WARP_THREADS, reduction_op);
-    }
-
-    /**
-     * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor.  The output is valid in warp <em>lane</em><sub>0</sub>.
-     *
-     * All threads across the calling warp must agree on the same value for \p valid_items.  Otherwise the result is undefined.
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a max reduction within a single, partially-full
-     * block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(int *d_data, int valid_items)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item per thread if in range
-     *     int thread_data;
-     *     if (threadIdx.x < valid_items)
-     *         thread_data = d_data[threadIdx.x];
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).Reduce(
-     *         thread_data, cub::Max(), valid_items);
-     *
-     * \endcode
-     * \par
-     * Suppose the input \p d_data is <tt>{0, 1, 2, 3, 4, ...</tt> and \p valid_items
-     * is \p 4.  The corresponding output \p aggregate in thread0 is \p 3 (and is
-     * undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ReductionOp>
-    __device__ __forceinline__ T Reduce(
-        T                   input,              ///< [in] Calling thread's input
-        ReductionOp         reduction_op,       ///< [in] Binary reduction operator
-        int                 valid_items)        ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS)
-    {
-        return InternalWarpReduce(temp_storage).Reduce<false, 1>(input, valid_items, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a head-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int head_flag = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce(
-     *         thread_data, head_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p head_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            Flag>
-    __device__ __forceinline__ T HeadSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        Flag                head_flag,          ///< [in] Head flag denoting whether or not \p input is the start of a new segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<true>(input, head_flag, reduction_op);
-    }
-
-
-    /**
-     * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags.  The reduction of each segment is returned to the first lane in that segment (which always includes <em>lane</em><sub>0</sub>).
-     *
-     * Supports non-commutative reduction operators
-     *
-     * \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates a tail-segmented warp max
-     * reduction within a block of 32 threads (one warp).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpReduce for type int
-     *     typedef cub::WarpReduce<int> WarpReduce;
-     *
-     *     // Allocate WarpReduce shared memory for one warp
-     *     __shared__ typename WarpReduce::TempStorage temp_storage;
-     *
-     *     // Obtain one input item and flag per thread
-     *     int thread_data = ...
-     *     int tail_flag = ...
-     *
-     *     // Return the warp-wide reductions to each lane0
-     *     int aggregate = WarpReduce(temp_storage).TailSegmentedReduce(
-     *         thread_data, tail_flag, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data and \p tail_flag across the block of threads
-     * is <tt>{0, 1, 2, 3, ..., 31</tt> and is <tt>{0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1</tt>,
-     * respectively.  The corresponding output \p aggregate in threads 0, 4, 8, etc. will be
-     * \p 3, \p 7, \p 11, etc. (and is undefined in other threads).
-     *
-     * \tparam ReductionOp     <b>[inferred]</b> Binary reduction operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <
-        typename            ReductionOp,
-        typename            Flag>
-    __device__ __forceinline__ T TailSegmentedReduce(
-        T                   input,              ///< [in] Calling thread's input
-        Flag                tail_flag,          ///< [in] Tail flag denoting whether or not \p input is the end of the current segment
-        ReductionOp         reduction_op)       ///< [in] Reduction operator
-    {
-        return InternalWarpReduce(temp_storage).template SegmentedReduce<false>(input, tail_flag, reduction_op);
-    }
-
-
-
-    //@}  end member group
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh b/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
deleted file mode 100644
index 01e375624..000000000
--- a/thrust/system/cuda/detail/cub/warp/warp_scan.cuh
+++ /dev/null
@@ -1,924 +0,0 @@
-/******************************************************************************
- * Copyright (c) 2011, Duane Merrill.  All rights reserved.
- * Copyright (c) 2011-2014, NVIDIA CORPORATION.  All rights reserved.
- * 
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- *     * Redistributions of source code must retain the above copyright
- *       notice, this list of conditions and the following disclaimer.
- *     * Redistributions in binary form must reproduce the above copyright
- *       notice, this list of conditions and the following disclaimer in the
- *       documentation and/or other materials provided with the distribution.
- *     * Neither the name of the NVIDIA CORPORATION nor the
- *       names of its contributors may be used to endorse or promote products
- *       derived from this software without specific prior written permission.
- * 
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- ******************************************************************************/
-
-/**
- * \file
- * The cub::WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.
- */
-
-#pragma once
-
-#include "specializations/warp_scan_shfl.cuh"
-#include "specializations/warp_scan_smem.cuh"
-#include "../thread/thread_operators.cuh"
-#include "../util_arch.cuh"
-#include "../util_type.cuh"
-#include "../util_namespace.cuh"
-
-/// Optional outer namespace(s)
-CUB_NS_PREFIX
-
-/// CUB namespace
-namespace cub {
-
-/**
- * \addtogroup WarpModule
- * @{
- */
-
-/**
- * \brief The WarpScan class provides [<em>collective</em>](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp.  ![](warp_scan_logo.png)
- *
- * \tparam T                        The scan input/output element type
- * \tparam LOGICAL_WARP_THREADS     <b>[optional]</b> The number of threads per "logical" warp (may be less than the number of hardware warp threads).  Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20).
- * \tparam PTX_ARCH                 <b>[optional]</b> \ptxversion
- *
- * \par Overview
- * - Given a list of input elements and a binary reduction operator, a [<em>prefix scan</em>](http://en.wikipedia.org/wiki/Prefix_sum)
- *   produces an output list where each element is computed to be the reduction
- *   of the elements occurring earlier in the input list.  <em>Prefix sum</em>
- *   connotes a prefix scan with the addition operator. The term \em inclusive indicates
- *   that the <em>i</em><sup>th</sup> output reduction incorporates the <em>i</em><sup>th</sup> input.
- *   The term \em exclusive indicates the <em>i</em><sup>th</sup> input is not incorporated into
- *   the <em>i</em><sup>th</sup> output reduction.
- * - Supports non-commutative scan operators
- * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads)
- * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Performance Considerations
- * - Uses special instructions when applicable (e.g., warp \p SHFL)
- * - Uses synchronization-free communication between warp lanes when applicable
- * - Incurs zero bank conflicts for most types
- * - Computation is slightly more efficient (i.e., having lower instruction overhead) for:
- *     - Summation (<b><em>vs.</em></b> generic scan)
- *     - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS
- *
- * \par Simple Examples
- * \warpcollective{WarpScan}
- * \par
- * The code snippet below illustrates four concurrent warp prefix sums within a block of
- * 128 threads (one per each of the 32-thread warps).
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for 4 warps
- *     __shared__ typename WarpScan::TempStorage temp_storage[4];
- *
- *     // Obtain one input item per thread
- *     int thread_data = ...
- *
- *     // Compute warp-wide prefix sums
- *     int warp_id = threadIdx.x / 32;
- *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data in each of the four warps of threads will be
- * <tt>0, 1, 2, 3, ..., 31}</tt>.
- *
- * \par
- * The code snippet below illustrates a single warp prefix sum within a block of
- * 128 threads.
- * \par
- * \code
- * #include <cub/cub.cuh>
- *
- * __global__ void ExampleKernel(...)
- * {
- *     // Specialize WarpScan for type int
- *     typedef cub::WarpScan<int> WarpScan;
- *
- *     // Allocate WarpScan shared memory for one warp
- *     __shared__ typename WarpScan::TempStorage temp_storage;
- *     ...
- *
- *     // Only the first warp performs a prefix sum
- *     if (threadIdx.x < 32)
- *     {
- *         // Obtain one input item per thread
- *         int thread_data = ...
- *
- *         // Compute warp-wide prefix sums
- *         WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data);
- *
- * \endcode
- * \par
- * Suppose the set of input \p thread_data across the warp of threads is <tt>{1, 1, 1, 1, ...}</tt>.
- * The corresponding output \p thread_data will be <tt>{0, 1, 2, 3, ..., 31}</tt>.
- *
- */
-template <
-    typename    T,
-    int         LOGICAL_WARP_THREADS    = CUB_PTX_WARP_THREADS,
-    int         PTX_ARCH                = CUB_PTX_ARCH>
-class WarpScan
-{
-private:
-
-    /******************************************************************************
-     * Constants and type definitions
-     ******************************************************************************/
-
-    enum
-    {
-        /// Whether the logical warp size and the PTX warp size coincide
-        IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)),
-
-        /// Whether the logical warp size is a power-of-two
-        IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0),
-
-        /// Whether the data type is an integer (which has fully-associative addition)
-        IS_INTEGER = ((Traits<T>::CATEGORY == SIGNED_INTEGER) || (Traits<T>::CATEGORY == UNSIGNED_INTEGER))
-    };
-
-    /// Internal specialization.  Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two)
-    typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO),
-        WarpScanShfl<T, LOGICAL_WARP_THREADS, PTX_ARCH>,
-        WarpScanSmem<T, LOGICAL_WARP_THREADS, PTX_ARCH> >::Type InternalWarpScan;
-
-    /// Shared memory storage layout type for WarpScan
-    typedef typename InternalWarpScan::TempStorage _TempStorage;
-
-
-    /******************************************************************************
-     * Thread fields
-     ******************************************************************************/
-
-    /// Shared storage reference
-    _TempStorage    &temp_storage;
-    int             lane_id;
-
-
-
-    /******************************************************************************
-     * Public types
-     ******************************************************************************/
-
-public:
-
-    /// \smemstorage{WarpScan}
-    struct TempStorage : Uninitialized<_TempStorage> {};
-
-
-    /******************************************************************//**
-     * \name Collective constructors
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Collective constructor using the specified memory allocation as temporary storage.  Logical warp and lane identifiers are constructed from <tt>threadIdx.x</tt>.
-     */
-    __device__ __forceinline__ WarpScan(
-        TempStorage &temp_storage)             ///< [in] Reference to memory allocation having layout type TempStorage
-    :
-        temp_storage(temp_storage.Alias()),
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS)
-    {}
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void InclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix sums
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.
-     *
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output)            ///< [out] Calling thread's output item.  May be aliased with \p input.
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, ZeroInitialize<T>(), cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix sum across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix sums
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p thread_data in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.  Furthermore, \p warp_aggregate for all threads in all warps will be \p 32.
-     */
-    __device__ __forceinline__ void ExclusiveSum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, ZeroInitialize<T>(), cub::Sum(), warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Inclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).InclusiveScan(
-     *         thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void InclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).InclusiveScan(input, output, scan_op, warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Exclusive prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, identity, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, identity, scan_op, warp_aggregate);
-    }
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Identityless exclusive prefix scans
-     *********************************************************************/
-    //@{
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no identity value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, scan_op);
-    }
-
-
-    /**
-     * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp.  Because no identity value is supplied, the \p output computed for <em>warp-lane</em><sub>0</sub> is undefined.  Also provides every thread with the warp-wide \p warp_aggregate of all inputs.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int warp_aggregate;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p thread_data in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)  Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads
-     * in the second warp, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void ExclusiveScan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &output,            ///< [out] Calling thread's output item.  May be aliased with \p input.
-        ScanOp          scan_op,            ///< [in] Binary scan operator
-        T               &warp_aggregate)    ///< [out] Warp-wide aggregate reduction of input items.
-    {
-        InternalWarpScan(temp_storage).ExclusiveScan(input, output, scan_op, warp_aggregate);
-    }
-
-
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Combination (inclusive & exclusive) prefix scans
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix sums across the calling warp.
-     *
-     * \par
-     *  - \identityzero
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide prefix sums within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute in|exclusive warp-wide prefix sums
-     *     int inclusive_partial, exclusive_partial;
-     *     int warp_id = threadIdx.x / 32;
-     *     WarpScan(temp_storage[warp_id]).Sum(thread_data, inclusive_partial, exclusive_partial);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{1, 1, 1, 1, ...}</tt>.
-     * The corresponding output \p inclusive_partial in each of the four warps of threads will be
-     * <tt>1, 2, 3, ..., 32}</tt>.
-     * The corresponding output \p exclusive_partial in each of the four warps of threads will be
-     * <tt>0, 1, 2, ..., 31}</tt>.
-     *
-     */
-    __device__ __forceinline__ void Sum(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output)  ///< [out] Calling thread's exclusive-scan output item.
-    {
-        InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, ZeroInitialize<T>(), cub::Sum());
-    }
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.
-     *
-     * \par
-     *  - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute inclusive warp-wide prefix max scans
-     *     int warp_id = threadIdx.x / 32;
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>INT_MIN, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>30, 32, 32, 34, ..., 60, 62</tt>, etc.
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        T               identity,           ///< [in] Identity value
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, identity, scan_op);
-    }
-
-
-    /**
-     * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp.  Because no identity value is supplied, the \p exclusive_output computed for <em>warp-lane</em><sub>0</sub> is undefined.
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of
-     * 128 threads (one per each of the 32-thread warps).
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Compute exclusive warp-wide prefix max scans
-     *     int inclusive_partial, exclusive_partial;
-     *     WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max());
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, -1, 2, -3, ..., 126, -127}</tt>.
-     * The corresponding output \p inclusive_partial in the first warp would be
-     * <tt>0, 0, 2, 2, ..., 30, 30</tt>, the output for the second warp would be <tt>32, 32, 34, 34, ..., 62, 62</tt>, etc.
-     * The corresponding output \p exclusive_partial in the first warp would be
-     * <tt>?, 0, 0, 2, ..., 28, 30</tt>, the output for the second warp would be <tt>?, 32, 32, 34, ..., 60, 62</tt>, etc.
-     * (The output \p thread_data in warp lane<sub>0</sub> is undefined.)
-     *
-     * \tparam ScanOp     <b>[inferred]</b> Binary scan operator type having member <tt>T operator()(const T &a, const T &b)</tt>
-     */
-    template <typename ScanOp>
-    __device__ __forceinline__ void Scan(
-        T               input,              ///< [in] Calling thread's input item.
-        T               &inclusive_output,  ///< [out] Calling thread's inclusive-scan output item.
-        T               &exclusive_output,  ///< [out] Calling thread's exclusive-scan output item.
-        ScanOp          scan_op)            ///< [in] Binary scan operator
-    {
-        InternalWarpScan(temp_storage).Scan(input, inclusive_output, exclusive_output, scan_op);
-    }
-
-    //@}  end member group
-    /******************************************************************//**
-     * \name Data exchange
-     *********************************************************************/
-    //@{
-
-    /**
-     * \brief Broadcast the value \p input from <em>warp-lane</em><sub><tt>src_lane</tt></sub> to all lanes in the warp
-     *
-     * \par
-     * - \smemreuse
-     *
-     * \par Snippet
-     * The code snippet below illustrates the warp-wide broadcasts of values from
-     * lanes<sub>0</sub> in each of four warps to all other threads in those warps.
-     * \par
-     * \code
-     * #include <cub/cub.cuh>
-     *
-     * __global__ void ExampleKernel(...)
-     * {
-     *     // Specialize WarpScan for type int
-     *     typedef cub::WarpScan<int> WarpScan;
-     *
-     *     // Allocate WarpScan shared memory for 4 warps
-     *     __shared__ typename WarpScan::TempStorage temp_storage[4];
-     *
-     *     // Obtain one input item per thread
-     *     int thread_data = ...
-     *
-     *     // Broadcast from lane0 in each warp to all other threads in the warp
-     *     int warp_id = threadIdx.x / 32;
-     *     thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0);
-     *
-     * \endcode
-     * \par
-     * Suppose the set of input \p thread_data across the block of threads is <tt>{0, 1, 2, 3, ..., 127}</tt>.
-     * The corresponding output \p thread_data will be
-     * <tt>{0, 0, ..., 0}</tt> in warp<sub>0</sub>,
-     * <tt>{32, 32, ..., 32}</tt> in warp<sub>1</sub>,
-     * <tt>{64, 64, ..., 64}</tt> in warp<sub>2</sub>, etc.
-     */
-    __device__ __forceinline__ T Broadcast(
-        T               input,              ///< [in] The value to broadcast
-        unsigned int    src_lane)           ///< [in] Which warp lane is to do the broadcasting
-    {
-        return InternalWarpScan(temp_storage).Broadcast(input, src_lane);
-    }
-
-    //@}  end member group
-
-};
-
-/** @} */       // end group WarpModule
-
-}               // CUB namespace
-CUB_NS_POSTFIX  // Optional outer namespace(s)
diff --git a/thrust/system/cuda/detail/cuda_launch_config.h b/thrust/system/cuda/detail/cuda_launch_config.h
deleted file mode 100644
index 1d703bf9d..000000000
--- a/thrust/system/cuda/detail/cuda_launch_config.h
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <cstddef>
-#include <thrust/detail/config.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-// XXX define our own device_properties_t to avoid errors when #including
-//     this file in the absence of a CUDA installation
-struct device_properties_t
-{
-  // mirror the type and spelling of cudaDeviceProp's members
-  // keep these alphabetized
-  int    major;
-  int    maxGridSize[3];
-  int    maxThreadsPerBlock;
-  int    maxThreadsPerMultiProcessor;
-  int    minor;
-  int    multiProcessorCount;
-  int    regsPerBlock;
-  size_t sharedMemPerBlock;
-  int    warpSize;
-};
-
-
-// XXX define our own device_properties_t to avoid errors when #including
-//     this file in the absence of a CUDA installation
-struct function_attributes_t
-{
-  // mirror the type and spelling of cudaFuncAttributes' members
-  // keep these alphabetized
-  size_t constSizeBytes;
-  size_t localSizeBytes;
-  int    maxThreadsPerBlock;
-  int    numRegs;
-  int    ptxVersion;
-  size_t sharedSizeBytes;
-};
-
-
-/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic.
- *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
- *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
- *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
- *          accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by
- *          the "CUDA Occupancy Calculator". 
- *  \note The __global__ function of interest is presumed to use 0 bytes of dynamically-allocated __shared__ memory.
- */
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties);
-
-/*! Computes a block size in number of threads for a CUDA kernel using a occupancy-promoting heuristic.
- *  Use this version of the function when a CUDA block's dynamically-allocated __shared__ memory requirements
- *  vary with the size of the block.
- *  \param attributes The cudaFuncAttributes corresponding to a __global__ function of interest on a GPU of interest.
- *  \param properties The cudaDeviceProp corresponding to a GPU on which to launch the __global__ function of interest.
- *  \param block_size_to_dynamic_smem_bytes A unary function which maps an integer CUDA block size to the number of bytes
- *         of dynamically-allocated __shared__ memory required by a CUDA block of that size.
- *  \return A CUDA block size, in number of threads, which the resources of the GPU's streaming multiprocessor can
- *          accomodate and which is intended to promote occupancy. The result is equivalent to the one performed by
- *          the "CUDA Occupancy Calculator". 
- */
-template<typename UnaryFunction>
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties,
-                                                        UnaryFunction block_size_to_dynamic_smem_size);
-
-
-/*! Returns the maximum amount of dynamic shared memory each block
- *  can utilize without reducing thread occupancy.
- *
- *  \param properties CUDA device properties
- *  \param attributes CUDA function attributes
- *  \param blocks_per_processor Number of blocks per streaming multiprocessor
- */
-inline __host__ __device__
-size_t proportional_smem_allocation(const device_properties_t   &properties,
-                                    const function_attributes_t &attributes,
-                                    size_t blocks_per_processor);
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-size_t max_blocksize_subject_to_smem_usage(const device_properties_t   &properties,
-                                           const function_attributes_t &attributes,
-                                           UnaryFunction blocksize_to_dynamic_smem_usage);
-
-
-
-namespace cuda_launch_config_detail
-{
-
-using std::size_t;
-
-namespace util
-{
-
-
-template<typename T>
-inline __host__ __device__
-T min_(const T &lhs, const T &rhs)
-{
-  return rhs < lhs ? rhs : lhs;
-}
-
-
-template <typename T>
-struct zero_function
-{
-  inline __host__ __device__
-  T operator()(T)
-  {
-    return 0;
-  }
-};
-
-
-// x/y rounding towards +infinity for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_ri(const L x, const R y)
-{
-    return (x + (y - 1)) / y;
-}
-
-// x/y rounding towards zero for integers, used to determine # of blocks/warps etc.
-template<typename L, typename R>
-  inline __host__ __device__ L divide_rz(const L x, const R y)
-{
-    return x / y;
-}
-
-// round x towards infinity to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_i(const L x, const R y){ return y * divide_ri(x, y); }
-
-// round x towards zero to the next multiple of y
-template<typename L, typename R>
-  inline __host__ __device__ L round_z(const L x, const R y){ return y * divide_rz(x, y); }
-
-} // end namespace util
-
-
-
-// granularity of shared memory allocation
-inline __host__ __device__
-size_t smem_allocation_unit(const device_properties_t &properties)
-{
-  switch(properties.major)
-  {
-    case 1:  return 512;
-    case 2:  return 128;
-    case 3:  return 256;
-    default: return 256; // unknown GPU; have to guess
-  }
-}
-
-
-// granularity of register allocation
-inline __host__ __device__
-int reg_allocation_unit(const device_properties_t &properties, const size_t regsPerThread)
-{
-  switch(properties.major)
-  {
-    case 1:  return (properties.minor <= 1) ? 256 : 512;
-    case 2:  switch(regsPerThread)
-             {
-               case 21:
-               case 22:
-               case 29:
-               case 30:
-               case 37:
-               case 38:
-               case 45:
-               case 46:
-                 return 128;
-               default:
-                 return 64;
-             }
-    case 3:  return 256;
-    default: return 256; // unknown GPU; have to guess
-  }
-}
-
-
-// granularity of warp allocation
-inline __host__ __device__
-size_t warp_allocation_multiple(const device_properties_t &properties)
-{
-  return (properties.major <= 1) ? 2 : 1;
-}
-
-// number of "sides" into which the multiprocessor is partitioned
-inline __host__ __device__
-size_t num_sides_per_multiprocessor(const device_properties_t &properties)
-{
-  switch(properties.major)
-  {
-    case 1:  return 1;
-    case 2:  return 2;
-    case 3:  return 4;
-    default: return 4; // unknown GPU; have to guess
-  }
-}
-
-
-inline __host__ __device__
-size_t max_blocks_per_multiprocessor(const device_properties_t &properties)
-{
-  return (properties.major <= 2) ? 8 : 16;
-}
-
-
-inline __host__ __device__
-size_t max_active_blocks_per_multiprocessor(const device_properties_t    &properties,
-                                            const function_attributes_t  &attributes,
-                                            size_t CTA_SIZE,
-                                            size_t dynamic_smem_bytes)
-{
-  // Determine the maximum number of CTAs that can be run simultaneously per SM
-  // This is equivalent to the calculation done in the CUDA Occupancy Calculator spreadsheet
-
-  //////////////////////////////////////////
-  // Limits due to threads/SM or blocks/SM
-  //////////////////////////////////////////
-  const size_t maxThreadsPerSM = properties.maxThreadsPerMultiProcessor;  // 768, 1024, 1536, etc.
-  const size_t maxBlocksPerSM  = max_blocks_per_multiprocessor(properties);
-
-  // Calc limits
-  const size_t ctaLimitThreads = (CTA_SIZE <= size_t(properties.maxThreadsPerBlock)) ? maxThreadsPerSM / CTA_SIZE : 0;
-  const size_t ctaLimitBlocks  = maxBlocksPerSM;
-
-  //////////////////////////////////////////
-  // Limits due to shared memory/SM
-  //////////////////////////////////////////
-  const size_t smemAllocationUnit     = smem_allocation_unit(properties);
-  const size_t smemBytes  = attributes.sharedSizeBytes + dynamic_smem_bytes;
-  const size_t smemPerCTA = util::round_i(smemBytes, smemAllocationUnit);
-
-  // Calc limit
-  const size_t ctaLimitSMem = smemPerCTA > 0 ? properties.sharedMemPerBlock / smemPerCTA : maxBlocksPerSM;
-
-  //////////////////////////////////////////
-  // Limits due to registers/SM
-  //////////////////////////////////////////
-  const int regAllocationUnit = reg_allocation_unit(properties, attributes.numRegs);
-  const size_t warpAllocationMultiple = warp_allocation_multiple(properties);
-  const size_t numWarps = util::round_i(util::divide_ri(CTA_SIZE, properties.warpSize), warpAllocationMultiple);
-
-  // Calc limit
-  size_t ctaLimitRegs;
-  if(properties.major <= 1)
-  {
-    // GPUs of compute capability 1.x allocate registers to CTAs
-    // Number of regs per block is regs per thread times number of warps times warp size, rounded up to allocation unit
-    const size_t regsPerCTA = util::round_i(attributes.numRegs * properties.warpSize * numWarps, regAllocationUnit);
-    ctaLimitRegs = regsPerCTA > 0 ? properties.regsPerBlock / regsPerCTA : maxBlocksPerSM;
-  }
-  else
-  {
-    // GPUs of compute capability 2.x and higher allocate registers to warps
-    // Number of regs per warp is regs per thread times times warp size, rounded up to allocation unit
-    const size_t regsPerWarp = util::round_i(attributes.numRegs * properties.warpSize, regAllocationUnit);
-    const size_t numSides = num_sides_per_multiprocessor(properties);
-    const size_t numRegsPerSide = properties.regsPerBlock / numSides;
-    ctaLimitRegs = regsPerWarp > 0 ? ((numRegsPerSide / regsPerWarp) * numSides) / numWarps : maxBlocksPerSM;
-  }
-
-  //////////////////////////////////////////
-  // Overall limit is min() of limits due to above reasons
-  //////////////////////////////////////////
-  return util::min_(ctaLimitRegs, util::min_(ctaLimitSMem, util::min_(ctaLimitThreads, ctaLimitBlocks)));
-}
-
-
-} // end namespace cuda_launch_config_detail
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties,
-                                                        UnaryFunction block_size_to_dynamic_smem_size)
-{
-  size_t max_occupancy      = properties.maxThreadsPerMultiProcessor;
-  size_t largest_blocksize  = cuda_launch_config_detail::util::min_(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
-  size_t granularity        = properties.warpSize;
-  size_t max_blocksize      = 0;
-  size_t highest_occupancy  = 0;
-
-  for(size_t blocksize = largest_blocksize; blocksize != 0; blocksize -= granularity)
-  {
-    size_t occupancy = blocksize * cuda_launch_config_detail::max_active_blocks_per_multiprocessor(properties, attributes, blocksize, block_size_to_dynamic_smem_size(blocksize));
-
-    if(occupancy > highest_occupancy)
-    {
-      max_blocksize = blocksize;
-      highest_occupancy = occupancy;
-    }
-
-    // early out, can't do better
-    if(highest_occupancy == max_occupancy)
-      break;
-  }
-
-  return max_blocksize;
-}
-
-
-inline __host__ __device__
-std::size_t block_size_with_maximum_potential_occupancy(const function_attributes_t &attributes,
-                                                        const device_properties_t   &properties)
-{
-  return block_size_with_maximum_potential_occupancy(attributes, properties, cuda_launch_config_detail::util::zero_function<std::size_t>());
-}
-
-
-inline __host__ __device__
-size_t proportional_smem_allocation(const device_properties_t   &properties,
-                                    const function_attributes_t &attributes,
-                                    size_t blocks_per_processor)
-{
-  size_t smem_per_processor    = properties.sharedMemPerBlock;
-  size_t smem_allocation_unit  = cuda_launch_config_detail::smem_allocation_unit(properties);
-
-  size_t total_smem_per_block  = cuda_launch_config_detail::util::round_z(smem_per_processor / blocks_per_processor, smem_allocation_unit);
-  size_t static_smem_per_block = attributes.sharedSizeBytes;
-  
-  return total_smem_per_block - static_smem_per_block;
-}
-
-
-template<typename UnaryFunction>
-inline __host__ __device__
-size_t max_blocksize_subject_to_smem_usage(const device_properties_t   &properties,
-                                           const function_attributes_t &attributes,
-                                           UnaryFunction blocksize_to_dynamic_smem_usage)
-{
-  size_t largest_blocksize = (thrust::min)(properties.maxThreadsPerBlock, attributes.maxThreadsPerBlock);
-  size_t granularity = properties.warpSize;
-  
-  for(int blocksize = largest_blocksize; blocksize > 0; blocksize -= granularity)
-  {
-    size_t total_smem_usage = blocksize_to_dynamic_smem_usage(blocksize) + attributes.sharedSizeBytes;
-
-    if(total_smem_usage <= properties.sharedMemPerBlock)
-    {
-      return blocksize;
-    }
-  }
-
-  return 0;
-}
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/decomposition.h b/thrust/system/cuda/detail/decomposition.h
deleted file mode 100644
index 403d84ac6..000000000
--- a/thrust/system/cuda/detail/decomposition.h
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/pair.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename Size>
-class trivial_decomposition
-{
-  public:
-    typedef Size size_type;
-
-    typedef thrust::pair<size_type,size_type> range;
-
-    __host__ __device__
-    trivial_decomposition()
-      : m_n(0)
-    {}
-
-    __host__ __device__
-    trivial_decomposition(size_type n)
-      : m_n(n)
-    {}
-
-    __host__ __device__
-    range operator[](size_type) const
-    {
-      return range(0, n());
-    }
-
-    __host__ __device__
-    size_type size() const
-    {
-      return 1;
-    }
-
-    // XXX think of a better name for this
-    __host__ __device__
-    size_type n() const
-    {
-      return m_n;
-    }
-
-  private:
-    Size m_n;
-};
-
-
-template<typename Size>
-__host__ __device__
-trivial_decomposition<Size> make_trivial_decomposition(Size n)
-{
-  return trivial_decomposition<Size>(n);
-}
-
-
-template<typename Size>
-class blocked_decomposition
-{
-  public:
-    typedef Size size_type;
-
-    typedef thrust::pair<size_type,size_type> range;
-
-    __host__ __device__
-    blocked_decomposition()
-      : m_n(0),
-        m_block_size(0),
-        m_num_partitions(0)
-    {}
-
-    __host__ __device__
-    blocked_decomposition(size_type n, Size block_size)
-      : m_n(n),
-        m_block_size(block_size),
-        m_num_partitions((n + block_size - 1) / block_size)
-    {}
-
-    __host__ __device__
-    range operator[](size_type i) const
-    {
-      size_type first = i * m_block_size;
-      size_type last  = thrust::min(m_n, first + m_block_size);
-
-      return range(first, last);
-    }
-
-    __host__ __device__
-    size_type size() const
-    {
-      return m_num_partitions;
-    }
-
-    // XXX think of a better name for this
-    __host__ __device__
-    size_type n() const
-    {
-      return m_n;
-    }
-
-  private:
-    Size m_n;
-    Size m_block_size;
-    Size m_num_partitions;
-};
-
-
-template<typename Size>
-__host__ __device__
-blocked_decomposition<Size> make_blocked_decomposition(Size n, Size block_size)
-{
-  return blocked_decomposition<Size>(n,block_size);
-}
-
-
-template<typename Size>
-class uniform_decomposition
-  : public blocked_decomposition<Size>
-{
-  private:
-    typedef blocked_decomposition<Size> super_t;
-
-  public:
-    __host__ __device__
-    uniform_decomposition()
-      : super_t()
-    {}
-
-    __host__ __device__
-    uniform_decomposition(Size n, Size num_partitions)
-      : super_t(n, n / num_partitions)
-    {}
-};
-
-
-template<typename Size>
-__host__ __device__
-uniform_decomposition<Size> make_uniform_decomposition(Size n, Size num_partitions)
-{
-  return uniform_decomposition<Size>(n,num_partitions);
-}
-
-
-template<typename Size>
-class aligned_decomposition
-{
-  public:
-    typedef Size size_type;
-
-    typedef thrust::pair<size_type,size_type> range;
-
-    __host__ __device__
-    aligned_decomposition()
-      : m_n(0),
-        m_num_partitions(0),
-        m_tile_size(0)
-    {}
-
-    __host__ __device__
-    aligned_decomposition(Size n, Size num_partitions, Size aligned_size)
-      : m_n(n),
-        m_num_partitions(num_partitions),
-        m_tile_size(aligned_size)
-    {
-      size_type num_tiles = (n + m_tile_size - 1) / m_tile_size;
-
-      m_num_tiles_per_partition = num_tiles / size();
-      m_last_partial_tile_size  =  num_tiles % size();
-    }
-
-    __host__ __device__
-    range operator[](Size i) const
-    {
-      range result = range_in_tiles(i);
-      result.first *= m_tile_size;
-      result.second = thrust::min<size_type>(m_n, result.second * m_tile_size);
-      return result;
-    }
-
-    __host__ __device__
-    size_type size() const
-    {
-      return m_num_partitions;
-    }
-
-    // XXX think of a better name for this
-    __host__ __device__
-    size_type n() const
-    {
-      return m_n;
-    }
-
-  private:
-    __host__ __device__
-    range range_in_tiles(size_type i) const
-    {
-      range result;
-
-      result.first = m_num_tiles_per_partition * i;
-      result.first += thrust::min<size_type>(i, m_last_partial_tile_size);
-
-      result.second = result.first + m_num_tiles_per_partition + (i < m_last_partial_tile_size);
-
-      return result;
-    }
-
-    size_type m_n;
-    size_type m_num_partitions;
-    size_type m_num_tiles_per_partition;
-    size_type m_tile_size;
-    size_type m_last_partial_tile_size;
-};
-
-
-template<typename Size>
-__host__ __device__
-aligned_decomposition<Size> make_aligned_decomposition(Size n, Size num_partitions, Size aligned_size)
-{
-  return aligned_decomposition<Size>(n,num_partitions,aligned_size);
-}
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/alignment.h b/thrust/system/cuda/detail/detail/alignment.h
deleted file mode 100644
index 3ba76a59a..000000000
--- a/thrust/system/cuda/detail/detail/alignment.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace alignment_of_detail
-{
-
-
-template<typename T> class alignment_of_impl;
-
-template<typename T, std::size_t size_diff>
-  struct helper
-{
-  static const std::size_t value = size_diff;
-};
-
-template<typename T>
-  class helper<T,0>
-{
-  public:
-    static const std::size_t value = alignment_of_impl<T>::value;
-};
-
-template<typename T>
-  class alignment_of_impl
-{
-  private:
-    struct big { T x; char c; };
-
-  public:
-    static const std::size_t value = helper<big, sizeof(big) - sizeof(T)>::value;
-};
-
-
-} // end alignment_of_detail
-
-
-template<typename T>
-  struct alignment_of
-    : alignment_of_detail::alignment_of_impl<T>
-{};
-
-
-template<std::size_t Align> struct aligned_type;
-
-// __align__ is CUDA-specific, so guard it
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
-// implementing aligned_type portably is tricky:
-
-#  if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
-// implement aligned_type with specialization because MSVC
-// requires literals as arguments to declspec(align(n))
-template<> struct aligned_type<1>
-{
-  struct __align__(1) type { };
-};
-
-template<> struct aligned_type<2>
-{
-  struct __align__(2) type { };
-};
-
-template<> struct aligned_type<4>
-{
-  struct __align__(4) type { };
-};
-
-template<> struct aligned_type<8>
-{
-  struct __align__(8) type { };
-};
-
-template<> struct aligned_type<16>
-{
-  struct __align__(16) type { };
-};
-
-template<> struct aligned_type<32>
-{
-  struct __align__(32) type { };
-};
-
-template<> struct aligned_type<64>
-{
-  struct __align__(64) type { };
-};
-
-template<> struct aligned_type<128>
-{
-  struct __align__(128) type { };
-};
-
-template<> struct aligned_type<256>
-{
-  struct __align__(256) type { };
-};
-
-template<> struct aligned_type<512>
-{
-  struct __align__(512) type { };
-};
-
-template<> struct aligned_type<1024>
-{
-  struct __align__(1024) type { };
-};
-
-template<> struct aligned_type<2048>
-{
-  struct __align__(2048) type { };
-};
-
-template<> struct aligned_type<4096>
-{
-  struct __align__(4096) type { };
-};
-
-template<> struct aligned_type<8192>
-{
-  struct __align__(8192) type { };
-};
-#  elif (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC) && (THRUST_GCC_VERSION < 40300)
-// implement aligned_type with specialization because gcc 4.2
-// requires literals as arguments to __attribute__(aligned(n))
-template<> struct aligned_type<1>
-{
-  struct __align__(1) type { };
-};
-
-template<> struct aligned_type<2>
-{
-  struct __align__(2) type { };
-};
-
-template<> struct aligned_type<4>
-{
-  struct __align__(4) type { };
-};
-
-template<> struct aligned_type<8>
-{
-  struct __align__(8) type { };
-};
-
-template<> struct aligned_type<16>
-{
-  struct __align__(16) type { };
-};
-
-template<> struct aligned_type<32>
-{
-  struct __align__(32) type { };
-};
-
-template<> struct aligned_type<64>
-{
-  struct __align__(64) type { };
-};
-
-template<> struct aligned_type<128>
-{
-  struct __align__(128) type { };
-};
-
-#  else
-// assume the compiler allows template parameters as
-// arguments to __align__ 
-template<std::size_t Align> struct aligned_type
-{
-  struct __align__(Align) type { };
-};
-#  endif // THRUST_HOST_COMPILER
-#else
-template<std::size_t Align> struct aligned_type
-{
-  struct type { };
-};
-#endif // THRUST_DEVICE_COMPILER
-
-
-template<std::size_t Len, std::size_t Align>
-  struct aligned_storage
-{
-  union type
-  {
-    unsigned char data[Len];
-
-    typename aligned_type<Align>::type align;
-  };
-};
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/balanced_path.h b/thrust/system/cuda/detail/detail/balanced_path.h
deleted file mode 100644
index 16d640205..000000000
--- a/thrust/system/cuda/detail/detail/balanced_path.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/pair.h>
-#include <thrust/detail/minmax.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace balanced_path_detail
-{
-
-template<bool UpperBound, typename IntT, typename It, typename T, typename Comp>
-__host__ __device__ void BinarySearchIteration(It data, int& begin, int& end,
-	T key, int shift, Comp comp) {
-
-	IntT scale = (1<< shift) - 1;
-	int mid = (int)((begin + scale * end)>> shift);
-
-	T key2 = data[mid];
-	bool pred = UpperBound ? !comp(key, key2) : comp(key2, key);
-	if(pred) begin = (int)mid + 1;
-	else end = mid;
-}
-
-template<bool UpperBound, typename T, typename It, typename Comp>
-__host__ __device__ int BinarySearch(It data, int count, T key, Comp comp) {
-	int begin = 0;
-	int end = count;
-	while(begin < end) 
-		BinarySearchIteration<UpperBound, int>(data, begin, end, key, 1, comp);
-	return begin;
-}
-
-template<bool UpperBound, typename IntT, typename T, typename It, typename Comp>
-__host__ __device__ int BiasedBinarySearch(It data, int count, T key, 
-	IntT levels, Comp comp) {
-	int begin = 0;
-	int end = count;
-
-	if(levels >= 4 && begin < end)
-		BinarySearchIteration<UpperBound, IntT>(data, begin, end, key, 9, comp);
-	if(levels >= 3 && begin < end)
-		BinarySearchIteration<UpperBound, IntT>(data, begin, end, key, 7, comp);
-	if(levels >= 2 && begin < end)
-		BinarySearchIteration<UpperBound, IntT>(data, begin, end, key, 5, comp);
-	if(levels >= 1 && begin < end)
-		BinarySearchIteration<UpperBound, IntT>(data, begin, end, key, 4, comp);
-
-	while(begin < end)
-		BinarySearchIteration<UpperBound, IntT>(data, begin, end, key, 1, comp);
-	return begin;
-}
-
-template<bool UpperBound, typename It1, typename It2, typename Comp>
-__host__ __device__ int MergePath(It1 a, int aCount, It2 b, int bCount, int diag, Comp comp)
-{
-  typedef typename thrust::iterator_traits<It1>::value_type T;
-  
-  int begin = thrust::max(0, diag - bCount);
-  int end   = thrust::min(diag, aCount);
-  
-  while(begin < end) 
-  {
-    int mid = (begin + end)>> 1;
-    T aKey = a[mid];
-    T bKey = b[diag - 1 - mid];
-    bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey);
-    if(pred) begin = mid + 1;
-    else end = mid;
-  }
-  return begin;
-}
-
-
-} // end namespace balanced_path_detail
-
-
-template<typename RandomAccessIterator1, typename Size1, typename RandomAccessIterator2, typename Size2, typename Compare>
-__host__ __device__
-thrust::pair<Size1,Size1>
-  balanced_path(RandomAccessIterator1 first1, Size1 n1,
-                RandomAccessIterator2 first2, Size1 n2,
-                Size1 diag,
-                Size2 levels,
-                Compare comp)
-{
-  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type T;
-
-  Size1 aIndex = balanced_path_detail::MergePath<false>(first1, n1, first2, n2, diag, comp);
-  Size1 bIndex = diag - aIndex;
-  
-  bool star = false;
-  if(bIndex < n2)
-  {
-    T x = first2[bIndex];
-    
-    // Search for the beginning of the duplicate run in both A and B.
-    Size1 aStart = balanced_path_detail::BiasedBinarySearch<false>(first1, aIndex, x, levels, comp);
-    Size1 bStart = balanced_path_detail::BiasedBinarySearch<false>(first2, bIndex, x, levels, comp);
-    
-    // The distance between x's merge path and its lower_bound is its rank.
-    // We add up the a and b ranks and evenly distribute them to
-    // get a stairstep path.
-    Size1 aRun = aIndex - aStart;
-    Size1 bRun = bIndex - bStart;
-    Size1 xCount = aRun + bRun;
-    
-    // Attempt to advance b and regress a.
-    Size1 bAdvance = thrust::max(xCount >> 1, xCount - aRun);
-    Size1 bEnd     = thrust::min<Size1>(n2, bStart + bAdvance + 1);
-    Size1 bRunEnd  = balanced_path_detail::BinarySearch<true>(first2 + bIndex, bEnd - bIndex, x, comp) + bIndex;
-    bRun = bRunEnd - bStart;
-    
-    bAdvance = thrust::min(bAdvance, bRun);
-    Size1 aAdvance = xCount - bAdvance;
-    
-    bool roundUp = (aAdvance == bAdvance + 1) && (bAdvance < bRun);
-    aIndex = aStart + aAdvance;
-    
-    if(roundUp) star = true;
-  }
-
-  return thrust::make_pair(aIndex, (diag - aIndex) + star);
-}
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/detail/cached_temporary_allocator.h b/thrust/system/cuda/detail/detail/cached_temporary_allocator.h
deleted file mode 100644
index 573ab4bcc..000000000
--- a/thrust/system/cuda/detail/detail/cached_temporary_allocator.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/detail/allocator/allocator_traits.h>
-#include <thrust/detail/allocator/temporary_allocator.h>
-#include <thrust/pair.h>
-#include <map>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy, template<typename> class BasePolicy>
-  class cached_temporary_allocator
-    : public BasePolicy<cached_temporary_allocator<DerivedPolicy,BasePolicy> >
-{
-  private:
-    typedef thrust::detail::temporary_allocator<char,DerivedPolicy> base_allocator_type;
-    typedef thrust::detail::allocator_traits<base_allocator_type>   traits;
-    typedef typename traits::pointer                                  allocator_pointer;
-    typedef std::multimap<std::ptrdiff_t, void*>                      free_blocks_type;
-    typedef std::map<void *, std::ptrdiff_t>                          allocated_blocks_type;
-
-    base_allocator_type   m_base_allocator;
-    free_blocks_type      free_blocks;
-    allocated_blocks_type allocated_blocks;
-
-    void free_all()
-    {
-      // deallocate all outstanding blocks in both lists
-      for(free_blocks_type::iterator i = free_blocks.begin();
-          i != free_blocks.end();
-          ++i)
-      {
-        // transform the pointer to allocator_pointer before calling deallocate
-        traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast<char*>(i->second)), i->first);
-      }
-
-      for(allocated_blocks_type::iterator i = allocated_blocks.begin();
-          i != allocated_blocks.end();
-          ++i)
-      {
-        // transform the pointer to allocator_pointer before calling deallocate
-        traits::deallocate(m_base_allocator, allocator_pointer(reinterpret_cast<char*>(i->first)), i->second);
-      }
-    }
-
-  public:
-    cached_temporary_allocator(thrust::execution_policy<DerivedPolicy> &system)
-      : m_base_allocator(system)
-    {}
-
-    ~cached_temporary_allocator()
-    {
-      // free all allocations when cached_allocator goes out of scope
-      free_all();
-    }
-
-    void *allocate(std::ptrdiff_t num_bytes)
-    {
-      void *result = 0;
-
-      // search the cache for a free block
-      free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
-
-      if(free_block != free_blocks.end())
-      {
-        // get the pointer
-        result = free_block->second;
-
-        // erase from the free_blocks map
-        free_blocks.erase(free_block);
-      }
-      else
-      {
-        // no allocation of the right size exists
-        // create a new one with m_base_allocator
-        // allocate memory and convert to raw pointer
-        result = thrust::raw_pointer_cast(traits::allocate(m_base_allocator, num_bytes));
-      }
-
-      // insert the allocated pointer into the allocated_blocks map
-      allocated_blocks.insert(std::make_pair(result, num_bytes));
-
-      return result;
-    }
-
-    void deallocate(void *ptr)
-    {
-      // erase the allocated block from the allocated blocks map
-      allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
-      std::ptrdiff_t num_bytes = iter->second;
-      allocated_blocks.erase(iter);
-
-      // insert the block into the free blocks map
-      free_blocks.insert(std::make_pair(num_bytes, ptr));
-    }
-};
-
-
-// overload get_temporary_buffer on cached_temporary_allocator
-// note that we take a reference to cached_temporary_allocator
-template<typename T, typename DerivedPolicy, template<typename> class BasePolicy>
-  thrust::pair<T*, std::ptrdiff_t>
-    get_temporary_buffer(cached_temporary_allocator<DerivedPolicy,BasePolicy> &alloc, std::ptrdiff_t n)
-{
-  // ask the allocator for sizeof(T) * n bytes
-  T* result = reinterpret_cast<T*>(alloc.allocate(sizeof(T) * n));
-
-  // return the pointer and the number of elements allocated
-  return thrust::make_pair(result,n);
-}
-
-
-// overload return_temporary_buffer on cached_temporary_allocator
-// an overloaded return_temporary_buffer should always accompany
-// an overloaded get_temporary_buffer
-template<typename Pointer, typename DerivedPolicy, template<typename> class BasePolicy>
-  void return_temporary_buffer(cached_temporary_allocator<DerivedPolicy,BasePolicy> &alloc, Pointer p)
-{
-  // return the pointer to the allocator
-  alloc.deallocate(thrust::raw_pointer_cast(p));
-}
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/launch_calculator.h b/thrust/system/cuda/detail/detail/launch_calculator.h
deleted file mode 100644
index 686b5d6c2..000000000
--- a/thrust/system/cuda/detail/detail/launch_calculator.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-#include <thrust/system/cuda/detail/cuda_launch_config.h>
-#include <thrust/tuple.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-template<typename Closure>
-class launch_calculator
-{
-  device_properties_t   properties;
-  function_attributes_t attributes;
-
-  public:
-  
-  __host__ __device__
-  launch_calculator();
-
-  __host__ __device__
-  launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes);
-
-  __host__ __device__
-  thrust::tuple<size_t,size_t,size_t> with_variable_block_size(void) const;
-
-  template<typename UnaryFunction>
-  __host__ __device__
-  thrust::tuple<size_t,size_t,size_t> with_variable_block_size(UnaryFunction block_size_to_smem_size) const;
-  
-  __host__ __device__
-  thrust::tuple<size_t,size_t,size_t> with_variable_block_size_available_smem(void) const;
-
-  private:
-
-  /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor)
-   *  where num_threads_per_block is a valid block size for an instance of Closure
-   *  chosen by a heuristic and num_blocks_per_multiprocessor is the maximum
-   *  number of such blocks that can execute on a streaming multiprocessor at once.
-   */
-  __host__ __device__
-  thrust::pair<size_t, size_t> default_block_configuration() const;
-
-  /*! Returns a pair (num_threads_per_block, num_blocks_per_multiprocessor)
-   *  where num_threads_per_block is a valid block size for an instance of Closure
-   *  chosen by a heuristic and num_blocks_per_multiprocessor is the maximum
-   *  number of such blocks that can execute on a streaming multiprocessor at once.
-   *
-   *  \param block_size_to_smem_size Mapping from num_threads_per_block to number of
-   *                                 dynamically-allocated bytes of shared memory
-   */
-  template<typename UnaryFunction>
-  __host__ __device__
-  thrust::pair<size_t, size_t> default_block_configuration(UnaryFunction block_size_to_smem_size) const;
-};
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/detail/launch_calculator.inl>
-
diff --git a/thrust/system/cuda/detail/detail/launch_calculator.inl b/thrust/system/cuda/detail/detail/launch_calculator.inl
deleted file mode 100644
index 3fd77d4f2..000000000
--- a/thrust/system/cuda/detail/detail/launch_calculator.inl
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-// do not attempt to compile this file with any other compiler
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
-#include <thrust/system/cuda/detail/runtime_introspection.h>
-#include <thrust/system/cuda/detail/cuda_launch_config.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-template<typename Closure>
-__host__ __device__
-launch_calculator<Closure>::launch_calculator(void)
-  : properties(device_properties()),
-    attributes(closure_attributes<Closure>())
-{}
-  
-template<typename Closure>
-__host__ __device__
-launch_calculator<Closure>::launch_calculator(const device_properties_t& properties, const function_attributes_t& attributes)
-  : properties(properties),
-    attributes(attributes)
-{}
-
-template<typename Closure>
-  template<typename UnaryFunction>
-__host__ __device__
-thrust::pair<size_t, size_t> launch_calculator<Closure>::default_block_configuration(UnaryFunction block_size_to_smem_size) const
-{
-  // choose a block size
-  std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties, block_size_to_smem_size);
-
-  // choose a subscription rate
-  std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block;
-
-  return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor);
-}
-
-
-template<typename Closure>
-__host__ __device__
-thrust::pair<size_t, size_t> launch_calculator<Closure>::default_block_configuration(void) const
-{
-  // choose a block size
-  std::size_t num_threads_per_block = block_size_with_maximum_potential_occupancy(attributes, properties);
-
-  // choose a subscription rate
-  std::size_t num_blocks_per_multiprocessor = properties.maxThreadsPerMultiProcessor / num_threads_per_block;
-
-  return thrust::make_pair(num_threads_per_block, num_blocks_per_multiprocessor);
-}
-
-template<typename Closure>
-__host__ __device__
-thrust::tuple<size_t,size_t,size_t> launch_calculator<Closure>::with_variable_block_size(void) const
-{
-  thrust::pair<size_t, size_t> config = default_block_configuration();
-  return thrust::tuple<size_t,size_t,size_t>(config.second * properties.multiProcessorCount, config.first, 0);
-}
-
-template <typename Closure>
-  template <typename UnaryFunction>
-__host__ __device__
-thrust::tuple<size_t,size_t,size_t> launch_calculator<Closure>::with_variable_block_size(UnaryFunction block_size_to_smem_size) const
-{
-  thrust::pair<size_t, size_t> config = default_block_configuration(block_size_to_smem_size);
-  return thrust::tuple<size_t,size_t,size_t>(config.second * properties.multiProcessorCount, config.first, block_size_to_smem_size(config.first));
-}
-  
-template<typename Closure>
-__host__ __device__
-thrust::tuple<size_t,size_t,size_t> launch_calculator<Closure>::with_variable_block_size_available_smem(void) const
-{
-  thrust::pair<size_t, size_t> config = default_block_configuration();
-  size_t smem_per_block = proportional_smem_allocation(properties, attributes, config.second);
-  return thrust::tuple<size_t,size_t,size_t>(config.second * properties.multiProcessorCount, config.first, smem_per_block);
-}
-
-} // end detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
diff --git a/thrust/system/cuda/detail/detail/launch_closure.h b/thrust/system/cuda/detail/detail/launch_closure.h
deleted file mode 100644
index 5c8ec4b07..000000000
--- a/thrust/system/cuda/detail/detail/launch_closure.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/system/cuda/detail/cuda_launch_config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<unsigned int _ThreadsPerBlock = 0,
-         unsigned int _BlocksPerMultiprocessor = 0>
-struct launch_bounds
-{
-  typedef thrust::detail::integral_constant<unsigned int, _ThreadsPerBlock>         ThreadsPerBlock;
-  typedef thrust::detail::integral_constant<unsigned int, _BlocksPerMultiprocessor> BlocksPerMultiprocessor;
-};
-
-
-struct thread_array : public launch_bounds<>
-{
-// CUDA built-in variables require nvcc
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-  __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return threadIdx.x; }
-  __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return blockDim.x * gridDim.x; } 
-#else
-  __device__ __thrust_forceinline__ unsigned int thread_index(void) const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int thread_count(void) const { return 0; } 
-#endif // THRUST_DEVICE_COMPILER_NVCC
-};
-
-
-struct blocked_thread_array : public launch_bounds<>
-{
-// CUDA built-in variables require nvcc
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-  __device__ __thrust_forceinline__ unsigned int thread_index(void)    const { return threadIdx.x; }
-  __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return blockDim.x;  } 
-  __device__ __thrust_forceinline__ unsigned int block_index(void)     const { return blockIdx.x;  }
-  __device__ __thrust_forceinline__ unsigned int grid_dimension(void)  const { return gridDim.x;   }
-  __device__ __thrust_forceinline__ unsigned int linear_index(void)    const { return block_dimension() * block_index() + thread_index(); }
-  __device__ __thrust_forceinline__ void         barrier(void)               { __syncthreads();    }
-#else
-  __device__ __thrust_forceinline__ unsigned int thread_index(void)    const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int block_index(void)     const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int grid_dimension(void)  const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int linear_index(void)    const { return 0; }
-  __device__ __thrust_forceinline__ void         barrier(void)               {           }
-#endif // THRUST_DEVICE_COMPILER_NVCC
-};
-
-
-template <unsigned int _ThreadsPerBlock>
-struct statically_blocked_thread_array : public launch_bounds<_ThreadsPerBlock,1>
-{
-// CUDA built-in variables require nvcc
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-  __device__ __thrust_forceinline__ unsigned int thread_index(void)    const { return threadIdx.x;      }
-  __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return _ThreadsPerBlock; } // minor optimization
-  __device__ __thrust_forceinline__ unsigned int block_index(void)     const { return blockIdx.x;       }
-  __device__ __thrust_forceinline__ unsigned int grid_dimension(void)  const { return gridDim.x;        }
-  __device__ __thrust_forceinline__ unsigned int linear_index(void)    const { return block_dimension() * block_index() + thread_index(); }
-  __device__ __thrust_forceinline__ void         barrier(void)               { __syncthreads();    }
-#else
-  __device__ __thrust_forceinline__ unsigned int thread_index(void)    const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int block_dimension(void) const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int block_index(void)     const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int grid_dimension(void)  const { return 0; }
-  __device__ __thrust_forceinline__ unsigned int linear_index(void)    const { return 0; }
-  __device__ __thrust_forceinline__ void         barrier(void)               {           }
-#endif // THRUST_DEVICE_COMPILER_NVCC
-};
-
-template<typename DerivedPolicy, typename Closure, typename Size>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size num_blocks);
-
-template<typename DerivedPolicy, typename Closure, typename Size1, typename Size2>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size);
-
-template<typename DerivedPolicy, typename Closure, typename Size1, typename Size2, typename Size3>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size);
-
-/*! Returns a copy of the cudaFuncAttributes structure
- *  that is associated with a given Closure
- */
-template<typename Closure>
-__host__ __device__
-function_attributes_t closure_attributes(void);
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/detail/launch_closure.inl>
-
diff --git a/thrust/system/cuda/detail/detail/launch_closure.inl b/thrust/system/cuda/detail/detail/launch_closure.inl
deleted file mode 100644
index ffba1b258..000000000
--- a/thrust/system/cuda/detail/detail/launch_closure.inl
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/minmax.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/runtime_introspection.h>
-#include <thrust/system/cuda/detail/synchronize.h>
-#include <thrust/system/cuda/detail/detail/launch_calculator.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-#include <thrust/system/cuda/detail/detail/alignment.h>
-#include <thrust/system/cuda/detail/bulk.h>
-
-namespace thrust
-{
-namespace detail
-{
-
-// XXX WAR circular inclusion problems with this forward declaration
-template<typename, typename> class temporary_array;
-
-} // end detail
-
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-template<typename Closure>
-__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value)
-void launch_closure_by_value(Closure f)
-{
-  f();
-}
-
-template<typename Closure>
-__global__ __launch_bounds__(Closure::context_type::ThreadsPerBlock::value, Closure::context_type::BlocksPerMultiprocessor::value)
-void launch_closure_by_pointer(const Closure *f)
-{
-  // copy to registers
-  Closure f_reg = *f;
-  f_reg();
-}
-#else
-template<typename Closure>
-void launch_closure_by_value(Closure) {}
-
-template<typename Closure>
-void launch_closure_by_pointer(const Closure *) {}
-
-#endif // THRUST_DEVICE_COMPILER_NVCC
-
-template<typename Closure,
-         bool launch_by_value = sizeof(Closure) <= 256>
-  struct closure_launcher_base
-{
-  typedef void (*launch_function_t)(Closure); 
- 
-  __host__ __device__
-  static launch_function_t get_launch_function()
-  {
-    return launch_closure_by_value<Closure>;
-  }
-
-  template<typename DerivedPolicy, typename Size1, typename Size2, typename Size3>
-  __host__ __device__
-  static void launch(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
-  {
-    // this ensures that the kernel gets instantiated identically for all values of __CUDA_ARCH__
-    launch_function_t kernel = get_launch_function();
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#if __BULK_HAS_CUDART__
-    if(num_blocks > 0)
-    {
-#ifndef __CUDA_ARCH__
-      kernel<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size, stream(thrust::detail::derived_cast(exec))>>>(f);
-#else
-      // XXX we can't pass parameters with constructors to kernels launched through the triple chevrons in __device__ code
-      //     use cudaLaunchDevice directly
-      void *param_buffer = cudaGetParameterBuffer(alignment_of<Closure>::value, sizeof(Closure));
-      std::memcpy(param_buffer, &f, sizeof(Closure));
-      cudaLaunchDevice(reinterpret_cast<void*>(kernel), param_buffer, dim3(num_blocks), dim3(block_size), smem_size, stream(thrust::detail::derived_cast(exec)));
-#endif // __CUDA_ARCH__
-      synchronize_if_enabled("launch_closure_by_value");
-    }
-#endif // __BULK_HAS_CUDART__
-#endif // THRUST_DEVICE_COMPILER_NVCC
-  }
-}; // end closure_launcher_base
-
-
-template<typename Closure>
-  struct closure_launcher_base<Closure,false>
-{
-  typedef void (*launch_function_t)(const Closure *); 
- 
-  __host__ __device__
-  static launch_function_t get_launch_function(void)
-  {
-    return launch_closure_by_pointer<Closure>;
-  }
-
-  template<typename DerivedPolicy, typename Size1, typename Size2, typename Size3>
-  __host__ __device__
-  static void launch(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
-  {
-    // this ensures that the kernel gets instantiated identically for all values of __CUDA_ARCH__
-    launch_function_t kernel = get_launch_function();
-
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-#if __BULK_HAS_CUDART__
-    if(num_blocks > 0)
-    {
-      // use temporary storage for the closure
-      thrust::host_system_tag host_tag;
-      thrust::detail::temporary_array<Closure,DerivedPolicy> closure_storage(exec, host_tag, &f, &f + 1);
-
-      // launch
-      kernel<<<(unsigned int) num_blocks, (unsigned int) block_size, (unsigned int) smem_size, stream(thrust::detail::derived_cast(exec))>>>((&closure_storage[0]).get());
-      synchronize_if_enabled("launch_closure_by_pointer");
-    }
-#endif // __BULK_HAS_CUDART__
-#endif // THRUST_DEVICE_COMPILER_NVCC
-  }
-};
-
-
-template<typename Closure>
-  struct closure_launcher
-    : public closure_launcher_base<Closure>
-{
-  typedef closure_launcher_base<Closure> super_t;
-  
-  __host__ __device__
-  static inline const device_properties_t& device_properties(void)
-  {
-    return device_properties();
-  }
-  
-  __host__ __device__
-  static inline function_attributes_t function_attributes(void)
-  {
-    return thrust::system::cuda::detail::function_attributes(super_t::get_launch_function());
-  }
-
-  template<typename DerivedPolicy, typename Size1, typename Size2, typename Size3>
-  __host__ __device__
-  static void launch(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
-  {
-    super_t::launch(exec,f,num_blocks,block_size,smem_size);
-  }
-};
-
-template<typename DerivedPolicy, typename Closure, typename Size>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size num_blocks)
-{
-  launch_calculator<Closure> calculator;
-  launch_closure(exec, f, num_blocks, thrust::get<1>(calculator.with_variable_block_size()));
-} // end launch_closure()
-
-template<typename DerivedPolicy, typename Closure, typename Size1, typename Size2>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size)
-{
-  launch_closure(exec, f, num_blocks, block_size, 0u);
-} // end launch_closure()
-
-template<typename DerivedPolicy, typename Closure, typename Size1, typename Size2, typename Size3>
-__host__ __device__
-void launch_closure(execution_policy<DerivedPolicy> &exec, Closure f, Size1 num_blocks, Size2 block_size, Size3 smem_size)
-{
-  closure_launcher<Closure>::launch(exec, f, num_blocks, block_size, smem_size);
-} // end launch_closure()
-
-
-namespace closure_attributes_detail
-{
-
-
-template<typename Closure>
-inline __host__ __device__
-function_attributes_t uncached_closure_attributes()
-{
-  typedef closure_launcher<Closure> Launcher;
-  return thrust::system::cuda::detail::function_attributes(Launcher::get_launch_function());
-}
-
-
-template<typename Closure>
-function_attributes_t cached_closure_attributes()
-{
-  // cache the result of function_attributes(), because it is slow
-  // only cache the first few devices
-  static const int max_num_devices                                  = 16;
-
-  static bool attributes_exist[max_num_devices]                     = {0};
-  static function_attributes_t function_attributes[max_num_devices] = {};
-
-  // XXX device_id ought to be an argument to this function
-  int device_id = current_device();
-
-  if(device_id >= max_num_devices)
-  {
-    return uncached_closure_attributes<Closure>();
-  }
-
-  if(!attributes_exist[device_id])
-  {
-    function_attributes[device_id] = uncached_closure_attributes<Closure>();
-
-    // disallow the compiler to move the write to attributes_exist[device_id]
-    // before the initialization of function_attributes[device_id]
-    __thrust_compiler_fence();
-
-    attributes_exist[device_id] = true;
-  }
-
-  return function_attributes[device_id];
-}
-
-
-} // end closure_attributes_detail
-
-  
-template<typename Closure>
-__host__ __device__
-function_attributes_t closure_attributes()
-{
-#ifndef __CUDA_ARCH__
-  return closure_attributes_detail::cached_closure_attributes<Closure>();
-#else
-  return closure_attributes_detail::uncached_closure_attributes<Closure>();
-#endif
-}
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/detail/merge.h b/thrust/system/cuda/detail/detail/merge.h
deleted file mode 100644
index a72959e2a..000000000
--- a/thrust/system/cuda/detail/detail/merge.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/iterator/iterator_traits.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-// sequential merge for when we have a static bound on the size of the result
-template<unsigned int result_size_bound, typename Iterator1, typename Iterator2, typename Iterator3, typename Compare>
-__device__
-void sequential_bounded_merge(Iterator1 first1, Iterator1 last1,
-                              Iterator2 first2, Iterator2 last2,
-                              Iterator3 result,
-                              Compare comp)
-{ 
-  // XXX nvcc generates the wrong code for the path below for sm_1x
-  //     so use this (slower) but equivalent implementation which does not prefetch
-#if __CUDA_ARCH__ < 200
-  for(unsigned int i = 0; i < result_size_bound; ++i, ++result)
-  {
-    bool p = (first2 >= last2) || ((first1 < last1) && !comp(*first2, *first1));
-    
-    *result = p ? *first1 : *first2;
-    
-    if(p)
-    {
-      ++first1;
-    }
-    else
-    {
-      ++first2;
-    }
-  }
-#else
-  typename thrust::iterator_value<Iterator1>::type aKey = *first1;
-  typename thrust::iterator_value<Iterator2>::type bKey = *first2;
-  
-  for(unsigned int i = 0; i < result_size_bound; ++i, ++result)
-  {
-    bool p = (first2 >= last2) || ((first1 < last1) && !comp(bKey, aKey));
-    
-    *result = p ? aKey : bKey;
-    
-    if(p)
-    {
-      ++first1;
-      aKey = *first1;
-    }
-    else
-    {
-      ++first2;
-      bKey = *first2;
-    }
-  }
-#endif
-}
-
-
-template<typename Size, typename Iterator1, typename Iterator2, typename Compare>
-__device__
-Size merge_path(Size pos, Iterator1 first1, Size n1, Iterator2 first2, Size n2, Compare comp)
-{
-  Size begin = (pos >= n2) ? (pos - n2) : Size(0);
-  Size end = thrust::min<Size>(pos, n1);
-  
-  while(begin < end)
-  {
-    Size mid = (begin + end) >> 1;
-
-    if(comp(first2[pos - 1 - mid], first1[mid]))
-    {
-      end = mid;
-    }
-    else
-    {
-      begin = mid + 1;
-    }
-  }
-  return begin;
-}
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/set_operation.h b/thrust/system/cuda/detail/detail/set_operation.h
deleted file mode 100644
index 940498677..000000000
--- a/thrust/system/cuda/detail/detail/set_operation.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename Compare,
-         typename SetOperation>
-__host__ __device__
-RandomAccessIterator3 set_operation(execution_policy<DerivedPolicy> &exec,
-                                    RandomAccessIterator1 first1,
-                                    RandomAccessIterator1 last1,
-                                    RandomAccessIterator2 first2,
-                                    RandomAccessIterator2 last2,
-                                    RandomAccessIterator3 result,
-                                    Compare comp,
-                                    SetOperation set_op);
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/detail/set_operation.inl>
-
diff --git a/thrust/system/cuda/detail/detail/set_operation.inl b/thrust/system/cuda/detail/detail/set_operation.inl
deleted file mode 100644
index 5c1d2da9b..000000000
--- a/thrust/system/cuda/detail/detail/set_operation.inl
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/detail/balanced_path.h>
-#include <thrust/system/cuda/detail/block/inclusive_scan.h>
-#include <thrust/system/cuda/detail/block/exclusive_scan.h>
-#include <thrust/system/cuda/detail/block/copy.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/detail/uninitialized.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform.h>
-#include <thrust/scan.h>
-#include <thrust/pair.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/cstdint.h>
-#include <thrust/detail/minmax.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace set_operation_detail
-{
-
-
-using thrust::system::cuda::detail::detail::statically_blocked_thread_array;
-using thrust::detail::uint16_t;
-using thrust::detail::uint32_t;
-
-
-// empirically determined on sm_20
-// value_types larger than this will fail to launch if placed in smem
-template<typename T>
-  struct stage_through_smem
-{
-  static const bool value = sizeof(T) <= 6 * sizeof(uint32_t);
-};
-
-
-// max_input_size <= 32
-template<typename Size, typename InputIterator, typename OutputIterator>
-inline __device__
-  OutputIterator serial_bounded_copy_if(Size max_input_size,
-                                        InputIterator first,
-                                        uint32_t mask,
-                                        OutputIterator result)
-{
-  for(Size i = 0; i < max_input_size; ++i, ++first)
-  {
-    if((1<<i) & mask)
-    {
-      *result = *first;
-      ++result;
-    }
-  }
-
-  return result;
-}
-
-
-template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
-  struct find_partition_offsets_functor
-{
-  Size partition_size;
-  InputIterator1 first1;
-  InputIterator2 first2;
-  Size n1, n2;
-  Compare comp;
-
-  __host__ __device__
-  find_partition_offsets_functor(Size partition_size,
-                                 InputIterator1 first1, InputIterator1 last1,
-                                 InputIterator2 first2, InputIterator2 last2,
-                                 Compare comp)
-    : partition_size(partition_size),
-      first1(first1), first2(first2),
-      n1(last1 - first1), n2(last2 - first2),
-      comp(comp)
-  {}
-
-  inline __host__ __device__
-  thrust::pair<Size,Size> operator()(Size i) const
-  {
-    Size diag = thrust::min(n1 + n2, i * partition_size);
-
-    // XXX the correctness of balanced_path depends critically on the ll suffix below
-    //     why???
-    return balanced_path(first1, n1, first2, n2, diag, 4ll, comp);
-  }
-};
-
-
-template<typename Size, typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
-__host__ __device__
-OutputIterator find_partition_offsets(thrust::cuda::execution_policy<DerivedPolicy> &exec,
-                                      Size num_partitions,
-                                      Size partition_size,
-                                      InputIterator1 first1, InputIterator1 last1,
-                                      InputIterator2 first2, InputIterator2 last2,
-                                      OutputIterator result,
-                                      Compare comp)
-{
-  find_partition_offsets_functor<Size,InputIterator1,InputIterator2,Compare> f(partition_size, first1, last1, first2, last2, comp);
-
-  return thrust::transform(exec,
-                           thrust::counting_iterator<Size>(0),
-                           thrust::counting_iterator<Size>(num_partitions),
-                           result,
-                           f);
-}
-
-
-namespace block
-{
-
-
-template<unsigned int block_size, typename T>
-inline __device__
-T right_neighbor(statically_blocked_thread_array<block_size> &ctx, const T &x, const T &boundary)
-{
-  // stage this shift to conserve smem
-  const unsigned int storage_size = block_size / 2;
-  __shared__ uninitialized_array<T,storage_size> shared;
-
-  T result = x;
-
-  unsigned int tid = ctx.thread_index();
-
-  if(0 < tid && tid <= storage_size)
-  {
-    shared[tid - 1] = x;
-  }
-
-  ctx.barrier();
-
-  if(tid < storage_size)
-  {
-    result = shared[tid];
-  }
-
-  ctx.barrier();
-  
-  tid -= storage_size;
-  if(0 < tid && tid <= storage_size)
-  {
-    shared[tid - 1] = x;
-  }
-  else if(tid == 0)
-  {
-    shared[storage_size-1] = boundary;
-  }
-
-  ctx.barrier();
-
-  if(tid < storage_size)
-  {
-    result = shared[tid];
-  }
-
-  ctx.barrier();
-
-  return result;
-}
-
-
-template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
-inline __device__
-  unsigned int bounded_count_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
-                                             InputIterator1 first1, uint16_t n1,
-                                             InputIterator2 first2, uint16_t n2,
-                                             Compare comp,
-                                             SetOperation set_op)
-{
-  unsigned int thread_idx = ctx.thread_index();
-
-  // find partition offsets
-  uint16_t diag = thrust::min<uint16_t>(n1 + n2, thread_idx * work_per_thread);
-  thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
-  thrust::pair<uint16_t,uint16_t> thread_input_end   = block::right_neighbor<block_size>(ctx, thread_input_begin, thrust::make_pair(n1,n2));
-
-  __shared__ uint16_t s_thread_output_size[block_size];
-
-  // work_per_thread + 1 to accomodate a "starred" partition returned from balanced_path above
-  s_thread_output_size[thread_idx] =
-    set_op.count(work_per_thread + 1,
-                 first1 + thread_input_begin.first,  first1 + thread_input_end.first,
-                 first2 + thread_input_begin.second, first2 + thread_input_end.second,
-                 comp);
-
-  ctx.barrier();
-
-  // reduce per-thread counts
-  thrust::system::cuda::detail::block::inplace_inclusive_scan(ctx, s_thread_output_size);
-  return s_thread_output_size[ctx.block_dimension() - 1];
-}
-
-
-inline __device__ int pop_count(unsigned int x)
-{
-// guard use of __popc from other compilers
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-  return __popc(x);
-#else
-  return x;
-#endif
-}
-
-
-
-template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
-inline __device__
-  OutputIterator bounded_set_operation_n(statically_blocked_thread_array<block_size> &ctx,
-                                         InputIterator1 first1, uint16_t n1,
-                                         InputIterator2 first2, uint16_t n2,
-                                         OutputIterator result,
-                                         Compare comp,
-                                         SetOperation set_op)
-{
-  unsigned int thread_idx = ctx.thread_index();
-  
-  // find partition offsets
-  uint16_t diag = thrust::min<uint16_t>(n1 + n2, thread_idx * work_per_thread);
-  thrust::pair<uint16_t,uint16_t> thread_input_begin = balanced_path(first1, n1, first2, n2, diag, 2, comp);
-  thrust::pair<uint16_t,uint16_t> thread_input_end   = block::right_neighbor<block_size>(ctx, thread_input_begin, thrust::make_pair(n1,n2));
-
-  typedef typename thrust::iterator_value<InputIterator1>::type value_type;
-  // +1 to accomodate a "starred" partition returned from balanced_path above
-  uninitialized_array<value_type, work_per_thread + 1> sparse_result;
-  uint32_t active_mask =
-    set_op(work_per_thread + 1,
-           first1 + thread_input_begin.first,  first1 + thread_input_end.first,
-           first2 + thread_input_begin.second, first2 + thread_input_end.second,
-           sparse_result.begin(),
-           comp);
-
-  __shared__ uint16_t s_thread_output_size[block_size];
-  s_thread_output_size[thread_idx] = pop_count(active_mask);
-
-  ctx.barrier();
-
-  // scan to turn per-thread counts into output indices
-  uint16_t block_output_size = thrust::system::cuda::detail::block::inplace_exclusive_scan(ctx, s_thread_output_size, 0u);
-
-  serial_bounded_copy_if(work_per_thread + 1, sparse_result.begin(), active_mask, result + s_thread_output_size[thread_idx]);
-
-  ctx.barrier();
-
-  return result + block_output_size;
-}
-
-
-template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename Compare, typename SetOperation>
-inline __device__
-  typename thrust::iterator_difference<InputIterator1>::type
-    count_set_operation(statically_blocked_thread_array<block_size> &ctx,
-                        InputIterator1 first1, InputIterator1 last1,
-                        InputIterator2 first2, InputIterator2 last2,
-                        Compare comp,
-                        SetOperation set_op)
-{
-  typedef typename thrust::iterator_difference<InputIterator1>::type difference;
-
-  difference result = 0;
-
-  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);
-
-  // iterate until the input is consumed
-  while(remaining_input_size.first + remaining_input_size.second > 0)
-  {
-    // find the end of this subpartition's input
-    // -1 to accomodate "starred" partitions
-    uint16_t max_subpartition_size = block_size * work_per_thread - 1;
-    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
-    thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);
-  
-    typedef typename thrust::iterator_value<InputIterator2>::type value_type;
-    if(stage_through_smem<value_type>::value)
-    {
-      // load the input into __shared__ storage
-      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;
-  
-      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
-      value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);
-  
-      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
-                                                                                 s_input.begin(), subpartition_size.first,
-                                                                                 s_input_end1,    subpartition_size.second,
-                                                                                 comp,
-                                                                                 set_op);
-    }
-    else
-    {
-      result += block::bounded_count_set_operation_n<block_size,work_per_thread>(ctx,
-                                                                                 first1, subpartition_size.first,
-                                                                                 first2, subpartition_size.second,
-                                                                                 comp,
-                                                                                 set_op);
-    }
-
-    // advance input
-    first1 += subpartition_size.first;
-    first2 += subpartition_size.second;
-
-    // decrement remaining size
-    remaining_input_size.first  -= subpartition_size.first;
-    remaining_input_size.second -= subpartition_size.second;
-  }
-
-  return result;
-}
-
-
-template<uint16_t block_size, uint16_t work_per_thread, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
-inline __device__
-OutputIterator set_operation(statically_blocked_thread_array<block_size> &ctx,
-                             InputIterator1 first1, InputIterator1 last1,
-                             InputIterator2 first2, InputIterator2 last2,
-                             OutputIterator result,
-                             Compare comp,
-                             SetOperation set_op)
-{
-  typedef typename thrust::iterator_difference<InputIterator1>::type difference;
-
-  thrust::pair<difference,difference> remaining_input_size = thrust::make_pair(last1 - first1, last2 - first2);
-
-  // iterate until the input is consumed
-  while(remaining_input_size.first + remaining_input_size.second > 0)
-  {
-    // find the end of this subpartition's input
-    // -1 to accomodate "starred" partitions
-    uint16_t max_subpartition_size = block_size * work_per_thread - 1;
-    difference diag = thrust::min<difference>(remaining_input_size.first + remaining_input_size.second, max_subpartition_size);
-    thrust::pair<uint16_t,uint16_t> subpartition_size = balanced_path(first1, remaining_input_size.first, first2, remaining_input_size.second, diag, 4ll, comp);
-    
-    typedef typename thrust::iterator_value<InputIterator2>::type value_type;
-    if(stage_through_smem<value_type>::value)
-    {
-      // load the input into __shared__ storage
-      __shared__ uninitialized_array<value_type, block_size * work_per_thread> s_input;
-  
-      value_type *s_input_end1 = thrust::system::cuda::detail::block::copy_n(ctx, first1, subpartition_size.first,  s_input.begin());
-      value_type *s_input_end2 = thrust::system::cuda::detail::block::copy_n(ctx, first2, subpartition_size.second, s_input_end1);
-  
-      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
-                                                                          s_input.begin(), subpartition_size.first,
-                                                                          s_input_end1,    subpartition_size.second,
-                                                                          result,
-                                                                          comp,
-                                                                          set_op);
-    }
-    else
-    {
-      result = block::bounded_set_operation_n<block_size,work_per_thread>(ctx,
-                                                                          first1, subpartition_size.first,
-                                                                          first2, subpartition_size.second,
-                                                                          result,
-                                                                          comp,
-                                                                          set_op);
-    }
-  
-    // advance input
-    first1 += subpartition_size.first;
-    first2 += subpartition_size.second;
-
-    // decrement remaining size
-    remaining_input_size.first  -= subpartition_size.first;
-    remaining_input_size.second -= subpartition_size.second;
-  }
-
-  return result;
-}
-
-
-} // end namespace block
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
-  inline __device__ void count_set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
-                                             InputIterator1                                      input_partition_offsets,
-                                             Size                                                num_partitions,
-                                             InputIterator2                                      first1,
-                                             InputIterator3                                      first2,
-                                             OutputIterator                                      result,
-                                             Compare                                             comp,
-                                             SetOperation                                        set_op)
-{
-  // consume partitions
-  for(Size partition_idx = ctx.block_index();
-      partition_idx < num_partitions;
-      partition_idx += ctx.grid_dimension())
-  {
-    typedef typename thrust::iterator_difference<InputIterator2>::type difference;
-
-    // find the partition
-    thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
-    thrust::pair<difference,difference> block_input_end   = input_partition_offsets[partition_idx + 1];
-
-    // count the size of the set operation
-    difference count = block::count_set_operation<threads_per_block,work_per_thread>(ctx,
-                                                                                     first1 + block_input_begin.first,  first1 + block_input_end.first,
-                                                                                     first2 + block_input_begin.second, first2 + block_input_end.second,
-                                                                                     comp,
-                                                                                     set_op);
-
-    if(ctx.thread_index() == 0)
-    {
-      result[partition_idx] = count;
-    }
-  }
-}
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
-  struct count_set_operation_closure
-{
-  typedef statically_blocked_thread_array<threads_per_block> context_type;
-
-  InputIterator1 input_partition_offsets;
-  Size           num_partitions;
-  InputIterator2 first1;
-  InputIterator3 first2;
-  OutputIterator result;
-  Compare        comp;
-  SetOperation   set_op;
-
-  __host__ __device__
-  count_set_operation_closure(InputIterator1 input_partition_offsets,
-                              Size           num_partitions,
-                              InputIterator2 first1,
-                              InputIterator3 first2,
-                              OutputIterator result,
-                              Compare        comp,
-                              SetOperation   set_op)
-    : input_partition_offsets(input_partition_offsets),
-      num_partitions(num_partitions),
-      first1(first1),
-      first2(first2),
-      result(result),
-      comp(comp),
-      set_op(set_op)
-  {}
-
-  inline __device__ void operator()() const
-  {
-    context_type ctx;
-    count_set_operation<threads_per_block,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, result, comp, set_op);
-  }
-};
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename OutputIterator, typename Compare, typename SetOperation>
-__host__ __device__
-  count_set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation>
-    make_count_set_operation_closure(InputIterator1 input_partition_offsets,
-                                     Size           num_partitions,
-                                     InputIterator2 first1,
-                                     InputIterator3 first2,
-                                     OutputIterator result,
-                                     Compare        comp,
-                                     SetOperation   set_op)
-{
-  typedef count_set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,OutputIterator,Compare,SetOperation> result_type;
-  return result_type(input_partition_offsets,num_partitions,first1,first2,result,comp,set_op);
-}
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
-inline __device__
-  void set_operation(statically_blocked_thread_array<threads_per_block> &ctx,
-                     InputIterator1                                      input_partition_offsets,
-                     Size                                                num_partitions,
-                     InputIterator2                                      first1,
-                     InputIterator3                                      first2,
-                     InputIterator4                                      output_partition_offsets,
-                     OutputIterator                                      result,
-                     Compare                                             comp,
-                     SetOperation                                        set_op)
-{
-  // consume partitions
-  for(Size partition_idx = ctx.block_index();
-      partition_idx < num_partitions;
-      partition_idx += ctx.grid_dimension())
-  {
-    typedef typename thrust::iterator_difference<InputIterator2>::type difference;
-
-    // find the partition
-    thrust::pair<difference,difference> block_input_begin = input_partition_offsets[partition_idx];
-    thrust::pair<difference,difference> block_input_end   = input_partition_offsets[partition_idx + 1];
-
-    // do the set operation across the partition
-    block::set_operation<threads_per_block,work_per_thread>(ctx,
-                                                            first1 + block_input_begin.first,  first1 + block_input_end.first,
-                                                            first2 + block_input_begin.second, first2 + block_input_end.second,
-                                                            result + output_partition_offsets[partition_idx],
-                                                            comp,
-                                                            set_op);
-  }
-}
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
-  struct set_operation_closure
-{
-  typedef statically_blocked_thread_array<threads_per_block> context_type;
-
-  InputIterator1 input_partition_offsets;
-  Size           num_partitions;
-  InputIterator2 first1;
-  InputIterator3 first2;
-  InputIterator4 output_partition_offsets;
-  OutputIterator result;
-  Compare        comp;
-  SetOperation   set_op;
-
-  __host__ __device__
-  set_operation_closure(InputIterator1 input_partition_offsets,
-                        Size           num_partitions,
-                        InputIterator2 first1,
-                        InputIterator3 first2,
-                        InputIterator4 output_partition_offsets,
-                        OutputIterator result,
-                        Compare        comp,
-                        SetOperation   set_op)
-    : input_partition_offsets(input_partition_offsets),
-      num_partitions(num_partitions),
-      first1(first1),
-      first2(first2),
-      output_partition_offsets(output_partition_offsets),
-      result(result),
-      comp(comp),
-      set_op(set_op)
-  {}
-
-  inline __device__ void operator()() const
-  {
-    context_type ctx;
-    set_operation<threads_per_block,work_per_thread>(ctx, input_partition_offsets, num_partitions, first1, first2, output_partition_offsets, result, comp, set_op);
-  }
-};
-
-
-template<uint16_t threads_per_block, uint16_t work_per_thread, typename InputIterator1, typename Size, typename InputIterator2, typename InputIterator3, typename InputIterator4, typename OutputIterator, typename Compare, typename SetOperation>
-__host__ __device__
-  set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation>
-    make_set_operation_closure(InputIterator1 input_partition_offsets,
-                               Size           num_partitions,
-                               InputIterator2 first1,
-                               InputIterator3 first2,
-                               InputIterator4 output_partition_offsets,
-                               OutputIterator result,
-                               Compare        comp,
-                               SetOperation   set_op)
-{
-  typedef set_operation_closure<threads_per_block,work_per_thread,InputIterator1,Size,InputIterator2,InputIterator3,InputIterator4,OutputIterator,Compare,SetOperation> result_type;
-  return result_type(input_partition_offsets,num_partitions,first1,first2,output_partition_offsets,result,comp,set_op);
-}
-
-
-} // end namespace set_operation_detail
-
-
-template<typename DerivedPolicy, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare, typename SetOperation>
-__host__ __device__
-OutputIterator set_operation(thrust::cuda::execution_policy<DerivedPolicy> &exec,
-                             InputIterator1 first1, InputIterator1 last1,
-                             InputIterator2 first2, InputIterator2 last2,
-                             OutputIterator result,
-                             Compare comp,
-                             SetOperation set_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  using thrust::system::cuda::detail::device_properties;
-  using thrust::system::cuda::detail::detail::launch_closure;
-  namespace d = thrust::system::cuda::detail::detail::set_operation_detail;
-
-  typedef typename thrust::iterator_difference<InputIterator1>::type difference;
-
-  const difference n1 = last1 - first1;
-  const difference n2 = last2 - first2;
-
-  // handle empty input
-  if(n1 == 0 && n2 == 0)
-  {
-    return result;
-  }
-
-  const thrust::detail::uint16_t work_per_thread   = 15;
-  const thrust::detail::uint16_t threads_per_block = 128;
-  const thrust::detail::uint16_t work_per_block    = threads_per_block * work_per_thread;
-
-  // -1 because balanced_path adds a single element to the end of a "starred" partition, increasing its size by one
-  const thrust::detail::uint16_t maximum_partition_size = work_per_block - 1;
-  const difference num_partitions = thrust::detail::util::divide_ri(n1 + n2, maximum_partition_size);
-
-  // find input partition offsets
-  // +1 to handle the end of the input elegantly
-  thrust::detail::temporary_array<thrust::pair<difference,difference>, DerivedPolicy> input_partition_offsets(0, exec, num_partitions + 1);
-  d::find_partition_offsets<difference>(exec, input_partition_offsets.size(), maximum_partition_size, first1, last1, first2, last2, input_partition_offsets.begin(), comp);
-
-  const difference num_blocks = thrust::min<difference>(device_properties().maxGridSize[0], num_partitions);
-
-  // find output partition offsets
-  // +1 to store the total size of the total
-  thrust::detail::temporary_array<difference, DerivedPolicy> output_partition_offsets(0, exec, num_partitions + 1);
-  launch_closure(exec,
-                 d::make_count_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), comp, set_op),
-                 num_blocks,
-                 threads_per_block);
-
-  // turn the output partition counts into offsets to output partitions
-  thrust::exclusive_scan(exec, output_partition_offsets.begin(), output_partition_offsets.end(), output_partition_offsets.begin());
-
-  // run the set op kernel
-  launch_closure(exec,
-                 d::make_set_operation_closure<threads_per_block,work_per_thread>(input_partition_offsets.begin(), num_partitions, first1, first2, output_partition_offsets.begin(), result, comp, set_op),
-                 num_blocks,
-                 threads_per_block);
-
-  return result + output_partition_offsets[num_partitions];
-}
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/detail/stable_merge_sort.h b/thrust/system/cuda/detail/detail/stable_merge_sort.h
deleted file mode 100644
index 953d350c6..000000000
--- a/thrust/system/cuda/detail/detail/stable_merge_sort.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file stable_merge_sort_dev.h
- *  \brief Defines the interface for a stable merge implementation on CUDA
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_merge_sort(execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       StrictWeakOrdering comp);
-    
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 keys_begin,
-                              RandomAccessIterator1 keys_end,
-                              RandomAccessIterator2 values_begin,
-                              StrictWeakOrdering comp);
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/detail/stable_merge_sort.inl>
-
diff --git a/thrust/system/cuda/detail/detail/stable_merge_sort.inl b/thrust/system/cuda/detail/detail/stable_merge_sort.inl
deleted file mode 100644
index 12e10b5dd..000000000
--- a/thrust/system/cuda/detail/detail/stable_merge_sort.inl
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/detail/stable_merge_sort.h>
-#include <thrust/system/cuda/detail/detail/stable_sort_each.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/detail/virtualized_smem_closure.h>
-#include <thrust/system/cuda/detail/merge.h>
-#include <thrust/system/cuda/detail/extern_shared_ptr.h>
-#include <thrust/detail/copy.h>
-#include <thrust/tabulate.h>
-#include <thrust/tuple.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/detail/function.h>
-#include <thrust/detail/integer_math.h>
-#include <thrust/detail/integer_traits.h>
-#include <thrust/detail/seq.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/detail/internal_functional.h>
-#include <thrust/system/cuda/detail/temporary_indirect_permutation.h>
-#include <thrust/system/cuda/detail/runtime_introspection.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace stable_merge_sort_detail
-{
-namespace block
-{
-
-
-// block-wise inplace merge for when we have a static bound on the size of the result (block_size * work_per_thread)
-template<unsigned int work_per_thread,
-         typename Context,
-         typename Iterator,
-         typename Size,
-         typename Compare>
-__device__
-void bounded_inplace_merge(Context &ctx, Iterator first, Size n1, Size n2, Compare comp)
-{
-  Iterator first2 = first + n1;
-
-  // don't ask for an out-of-bounds diagonal
-  Size diag = thrust::min<Size>(n1 + n2, work_per_thread * ctx.thread_index());
-
-  Size mp = merge_path(diag, first, n1, first2, n2, comp);
-
-  // compute the ranges of the sources
-  Size start1 = mp;
-  Size start2 = diag - mp;
-
-  Size end1 = n1;
-  Size end2 = n2;
-  
-  // each thread does a local sequential merge
-  typedef typename thrust::iterator_value<Iterator>::type value_type;
-  value_type local_result[work_per_thread];
-  sequential_bounded_merge<work_per_thread>(first  + start1, first  + end1,
-                                            first2 + start2, first2 + end2,
-                                            local_result, comp);
-
-  ctx.barrier();
-
-  // store the result
-  // XXX we unconditionally copy work_per_thread elements here, even if input was partially-sized
-  thrust::copy_n(thrust::seq, local_result, work_per_thread, first + work_per_thread * ctx.thread_index());
-  ctx.barrier();
-}
-
-
-// staged, block-wise merge for when we have a static bound on the size of the result (block_size * work_per_thread)
-template<unsigned int work_per_thread,
-         typename Context,
-         typename Iterator1, typename Size1,
-         typename Iterator2, typename Size2,
-         typename Iterator3,
-         typename Iterator4,
-	 typename Compare>
-__device__
-void staged_bounded_merge(Context &ctx,
-                          Iterator1 first1, Size1 n1,
-                          Iterator2 first2, Size2 n2,
-                          Iterator3 staging_buffer,
-                          Iterator4 result,
-                          Compare comp)
-{
-  // stage the input through the buffer
-  cuda::detail::block::async_copy_n_global_to_shared<work_per_thread>(ctx, first1, n1, staging_buffer);
-  cuda::detail::block::async_copy_n_global_to_shared<work_per_thread>(ctx, first2, n2, staging_buffer + n1);
-  ctx.barrier();
-
-  // cooperatively merge in place
-  block::bounded_inplace_merge<work_per_thread>(ctx, staging_buffer, n1, n2, comp);
-  
-  // store result in buffer to result
-  cuda::detail::block::copy_n(ctx, staging_buffer, n1 + n2, result);
-}
-
-
-} // end block
-
-
-// Returns (start1, end1, start2, end2) into mergesort input lists between mp0 and mp1.
-inline __host__ __device__
-thrust::tuple<int,int,int,int> find_mergesort_interval(int partition_first1, int partition_size, int num_blocks_per_merge, int block_idx, int num_elements_per_block, int n, int mp, int right_mp)
-{
-  int partition_first2 = partition_first1 + partition_size;
-
-  // Locate diag from the start of the A sublist.
-  int diag = num_elements_per_block * block_idx - partition_first1;
-  int start1 = partition_first1 + mp;
-  int end1 = thrust::min<int>(n, partition_first1 + right_mp);
-  int start2 = thrust::min<int>(n, partition_first2 + diag - mp);
-  int end2 = thrust::min<int>(n, partition_first2 + diag + num_elements_per_block - right_mp);
-  
-  // The end partition of the last block for each merge operation is computed
-  // and stored as the begin partition for the subsequent merge. i.e. it is
-  // the same partition but in the wrong coordinate system, so its 0 when it
-  // should be listSize. Correct that by checking if this is the last block
-  // in this merge operation.
-  if(num_blocks_per_merge - 1 == ((num_blocks_per_merge - 1) & block_idx))
-  {
-    end1 = thrust::min<int>(n, partition_first1 + partition_size);
-    end2 = thrust::min<int>(n, partition_first2 + partition_size);
-  }
-
-  return thrust::make_tuple(start1, end1, start2, end2);
-}
-
-
-inline __host__ __device__
-thrust::tuple<int,int,int,int> locate_merge_partitions(int n, int block_idx, int num_blocks_per_merge, int num_elements_per_block, int mp, int right_mp)
-{
-  int first_block_in_partition = ~(num_blocks_per_merge - 1) & block_idx;
-  int partition_size = num_elements_per_block * (num_blocks_per_merge >> 1);
-
-  int partition_first1 = num_elements_per_block * first_block_in_partition;
-
-  return find_mergesort_interval(partition_first1, partition_size, num_blocks_per_merge, block_idx, num_elements_per_block, n, mp, right_mp);
-}
-
-
-template<unsigned int work_per_thread,
-         typename Context,
-         typename Size,
-         typename Iterator1,
-         typename Iterator2,
-         typename Iterator3,
-         typename Compare>
-struct merge_adjacent_partitions_closure
-{
-  typedef Context context_type;
-
-  Size num_blocks_per_merge;
-  Iterator1 first;
-  Size n;
-  Iterator2 merge_paths;
-  Iterator3 result;
-  thrust::detail::wrapped_function<Compare,bool> comp;
-
-
-  __host__ __device__
-  merge_adjacent_partitions_closure(Size num_blocks_per_merge, Iterator1 first, Size n, Iterator2 merge_paths, Iterator3 result, Compare comp)
-    : num_blocks_per_merge(num_blocks_per_merge),
-      first(first),
-      n(n),
-      merge_paths(merge_paths),
-      result(result),
-      comp(comp)
-  {}
-
-
-  template<typename RandomAccessIterator>
-  __thrust_forceinline__ __device__
-  void operator()(RandomAccessIterator staging_buffer)
-  {
-    context_type ctx;
-
-    Size work_per_block = ctx.block_dimension() * work_per_thread;
-    
-    Size start1 = 0, end1 = 0, start2 = 0, end2 = 0;
-
-    thrust::tie(start1,end1,start2,end2) =
-      locate_merge_partitions(n, ctx.block_index(), num_blocks_per_merge, work_per_block, merge_paths[ctx.block_index()], merge_paths[ctx.block_index() + 1]);
-
-    block::staged_bounded_merge<work_per_thread>(ctx,
-                                                 first + start1, end1 - start1,
-                                                 first + start2, end2 - start2,
-                                                 staging_buffer,
-                                                 result + ctx.block_index() * work_per_block,
-                                                 comp);
-  }
-
-
-  __thrust_forceinline__ __device__
-  void operator()()
-  {
-    typedef typename thrust::iterator_value<Iterator1>::type value_type;
-
-    // stage this operation through smem
-    // the size of this array is block_size * (work_per_thread + 1)
-    value_type *s_keys = thrust::system::cuda::detail::extern_shared_ptr<value_type>();
-    
-    this->operator()(s_keys);
-  }
-};
-
-
-template<unsigned int work_per_thread,
-         typename DerivedPolicy,
-         typename Context,
-         typename Size,
-         typename Iterator1,
-         typename Iterator2,
-         typename Pointer,
-         typename Iterator3,
-         typename Compare>
-__host__ __device__
-void merge_adjacent_partitions(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                               Context context,
-                               unsigned int block_size,
-                               Size num_blocks_per_merge,
-                               Iterator1 first,
-                               Size n,
-                               Iterator2 merge_paths,
-                               Pointer virtual_smem,
-                               Iterator3 result,
-                               Compare comp)
-{
-  typedef merge_adjacent_partitions_closure<
-    work_per_thread,
-    Context,
-    Size,
-    Iterator1,
-    Iterator2,
-    Iterator3,
-    Compare
-  > closure_type;
-
-  closure_type closure(num_blocks_per_merge, first, n, merge_paths, result, comp);
-
-  Size num_blocks = thrust::detail::util::divide_ri(n, block_size * work_per_thread);
-
-  typedef typename thrust::iterator_value<Iterator1>::type value_type;
-
-  const size_t num_smem_elements_per_block = block_size * (work_per_thread + 1);
-
-  // XXX this virtualizing code can probably be generalized and moved elsewhere
-  if(virtual_smem)
-  {
-    virtualized_smem_closure<closure_type, Pointer> virtualized_closure(closure, num_smem_elements_per_block, virtual_smem);
-
-    thrust::system::cuda::detail::detail::launch_closure(exec, virtualized_closure, num_blocks, block_size);
-  }
-  else
-  {
-    const size_t num_smem_bytes = num_smem_elements_per_block * sizeof(value_type);
-
-    thrust::system::cuda::detail::detail::launch_closure(exec, closure, num_blocks, block_size, num_smem_bytes);
-  }
-}
-
-
-template<typename Iterator, typename Size, typename Compare>
-struct locate_merge_path
-{
-  Iterator haystack_first;
-  Size haystack_size;
-  Size num_elements_per_block;
-  Size num_blocks_per_merge;
-  thrust::detail::wrapped_function<Compare,bool> comp;
-
-  __host__ __device__
-  locate_merge_path(Iterator haystack_first, Size haystack_size, Size num_elements_per_block, Size num_blocks_per_merge, Compare comp)
-    : haystack_first(haystack_first),
-      haystack_size(haystack_size),
-      num_elements_per_block(num_elements_per_block),
-      num_blocks_per_merge(num_blocks_per_merge),
-      comp(comp)
-  {}
-
-  template<typename Index>
-  __host__ __device__
-  Index operator()(Index merge_path_idx)
-  {
-    // find the index of the first CTA that will participate in the eventual merge
-    Size first_block_in_partition = ~(num_blocks_per_merge - 1) & merge_path_idx;
-
-    // the size of each block's input
-    Size size = num_elements_per_block * (num_blocks_per_merge / 2);
-
-    // find pointers to the two input arrays
-    Size start1 = num_elements_per_block * first_block_in_partition;
-    Size start2 = thrust::min<Size>(haystack_size, start1 + size);
-
-    // the size of each input array
-    // note we clamp to the end of the total input to handle the last partial list
-    Size n1 = thrust::min<Size>(size, haystack_size - start1);
-    Size n2 = thrust::min<Size>(size, haystack_size - start2);
-    
-    // note that diag is computed as an offset from the beginning of the first list
-    Size diag = thrust::min<Size>(n1 + n2, num_elements_per_block * merge_path_idx - start1);
-
-    return merge_path(diag, haystack_first + start1, n1, haystack_first + start2, n2, comp);
-  }
-};
-
-
-template<typename DerivedPolicy, typename Iterator1, typename Size1, typename Iterator2, typename Size2, typename Compare>
-__host__ __device__
-void locate_merge_paths(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                        Iterator1 result,
-                        Size1 n,
-                        Iterator2 haystack_first,
-                        Size2 haystack_size,
-                        Size2 num_elements_per_block,
-                        Size2 num_blocks_per_merge,
-                        Compare comp)
-{
-  locate_merge_path<Iterator2,Size2,Compare> f(haystack_first, haystack_size, num_elements_per_block, num_blocks_per_merge, comp);
-
-  thrust::tabulate(exec, result, result + n, f);
-}
-
-
-template<typename T>
-__host__ __device__
-bool virtualize_smem(size_t num_elements_per_block)
-{
-#ifndef __CUDA_ARCH__
-  size_t num_smem_bytes_required = num_elements_per_block * sizeof(T);
-
-  thrust::system::cuda::detail::device_properties_t props = thrust::system::cuda::detail::device_properties();
-
-  size_t num_smem_bytes_available = props.sharedMemPerBlock;
-  if(props.major == 1)
-  {
-    // pay the kernel parameters tax on Tesla
-    num_smem_bytes_available -= 256;
-  }
-
-  return num_smem_bytes_required > num_smem_bytes_available;
-#else
-  // we should never need to virtualize smem on anything besides Tesla,
-  // and Tesla will never execute this code path
-  return false;
-#endif
-}
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator, typename Size, typename Compare>
-__host__ __device__
-void stable_merge_sort_n(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                         RandomAccessIterator first,
-                         Size n,
-                         Compare comp)
-{
-  if(n <= 0) return;
-
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type T;
-
-  const Size block_size = 256;
-
-  typedef thrust::system::cuda::detail::detail::statically_blocked_thread_array<block_size> context_type;
-
-  context_type context;
-
-  const Size work_per_thread = (sizeof(T) < 8) ?  11 : 7;
-  const Size work_per_block = block_size * work_per_thread;
-
-  Size num_blocks = thrust::detail::util::divide_ri(n, work_per_block);
-
-  const unsigned int num_smem_elements_per_block = block_size * (work_per_thread + 1);
-
-  thrust::detail::temporary_array<T,DerivedPolicy> virtual_smem(exec, virtualize_smem<T>(num_smem_elements_per_block) ? (num_blocks * num_smem_elements_per_block) : 0);
-  
-  // depending on the number of passes
-  // we'll either do the initial segmented sort inplace or not
-  // ping being true means the latest data is in the source array
-  bool ping = false;
-  thrust::detail::temporary_array<T,DerivedPolicy> pong_buffer(exec, n);
-
-  Size num_passes = thrust::detail::log2_ri(num_blocks);
-
-  if(thrust::detail::is_odd(num_passes))
-  {
-    stable_sort_each_copy<work_per_thread>(exec, context, block_size, first, first + n, thrust::raw_pointer_cast(&*virtual_smem.begin()), pong_buffer.begin(), comp);
-  }
-  else
-  {
-    stable_sort_each_copy<work_per_thread>(exec, context, block_size, first, first + n, thrust::raw_pointer_cast(&*virtual_smem.begin()), first, comp);
-    ping = true;
-  }
-
-  thrust::detail::temporary_array<Size,DerivedPolicy> merge_paths(exec, num_blocks + 1);
-  
-  for(Size pass = 0; pass < num_passes; ++pass, ping = !ping)
-  {
-    Size num_blocks_per_merge = 2 << pass;
-
-    if(ping)
-    {
-      locate_merge_paths(exec, merge_paths.begin(), merge_paths.size(), first, n, work_per_block, num_blocks_per_merge, comp);
-
-      merge_adjacent_partitions<work_per_thread>(exec, context, block_size, num_blocks_per_merge, first, n, merge_paths.begin(), thrust::raw_pointer_cast(&*virtual_smem.begin()), pong_buffer.begin(), comp);
-    }
-    else
-    {
-      locate_merge_paths(exec, merge_paths.begin(), merge_paths.size(), pong_buffer.begin(), n, work_per_block, num_blocks_per_merge, comp);
-
-      merge_adjacent_partitions<work_per_thread>(exec, context, block_size, num_blocks_per_merge, pong_buffer.begin(), n, merge_paths.begin(), thrust::raw_pointer_cast(&*virtual_smem.begin()), first, comp);
-    }
-  }
-}
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator, typename Compare>
-__host__ __device__
-void stable_merge_sort(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       Compare comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator>::type difference_type;
-
-  difference_type n = last - first;
-
-  // if difference_type is large and n can fit into a 32b uint then use that
-  thrust::detail::uint32_t threshold = thrust::detail::integer_traits<thrust::detail::uint32_t>::const_max;
-  if(sizeof(difference_type) > sizeof(thrust::detail::uint32_t) && n <= difference_type(threshold))
-  {
-    stable_merge_sort_n(exec, first, static_cast<thrust::detail::uint32_t>(n), comp);
-  }
-  else
-  {
-    stable_merge_sort_n(exec, first, n, comp);
-  }
-}
-
-
-} // end namespace stable_merge_sort_detail
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator, typename Compare>
-__host__ __device__
-void stable_merge_sort(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       Compare comp)
-{
-  // decide whether to apply indirection
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-
-  typedef thrust::detail::integral_constant<bool, (sizeof(value_type) > 16)> use_indirection;
-
-  conditional_temporary_indirect_ordering<
-    use_indirection,
-    DerivedPolicy,
-    RandomAccessIterator,
-    Compare
-  > potentially_indirect_range(exec, first, last, comp);
-
-  stable_merge_sort_detail::stable_merge_sort(exec,
-                                              potentially_indirect_range.begin(),
-                                              potentially_indirect_range.end(),
-                                              potentially_indirect_range.comp());
-}
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator1, typename RandomAccessIterator2, typename Compare>
-__host__ __device__
-void stable_merge_sort_by_key(thrust::system::cuda::execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator1 keys_last,
-                              RandomAccessIterator2 values_first,
-                              Compare comp)
-{
-  typedef thrust::tuple<RandomAccessIterator1,RandomAccessIterator2> iterator_tuple;
-  typedef thrust::zip_iterator<iterator_tuple> zip_iterator;
-
-  zip_iterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first));
-  zip_iterator zipped_last = thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first));
-
-  thrust::detail::compare_first<Compare> comp_first(comp);
-
-  stable_merge_sort(exec, zipped_first, zipped_last, comp_first);
-}
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/detail/stable_primitive_sort.h b/thrust/system/cuda/detail/detail/stable_primitive_sort.h
deleted file mode 100644
index ace3e8f40..000000000
--- a/thrust/system/cuda/detail/detail/stable_primitive_sort.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/functional.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                           RandomAccessIterator first,
-                           RandomAccessIterator last,
-                           thrust::less<typename thrust::iterator_value<RandomAccessIterator>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                           RandomAccessIterator first,
-                           RandomAccessIterator last,
-                           thrust::greater<typename thrust::iterator_value<RandomAccessIterator>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                                  RandomAccessIterator1 keys_first,
-                                  RandomAccessIterator1 keys_last,
-                                  RandomAccessIterator2 values_first,
-                                  thrust::less<typename thrust::iterator_value<RandomAccessIterator1>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                                  RandomAccessIterator1 keys_first,
-                                  RandomAccessIterator1 keys_last,
-                                  RandomAccessIterator2 values_first,
-                                  thrust::greater<typename thrust::iterator_value<RandomAccessIterator1>::type>);
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/detail/stable_primitive_sort.inl>
-
diff --git a/thrust/system/cuda/detail/detail/stable_primitive_sort.inl b/thrust/system/cuda/detail/detail/stable_primitive_sort.inl
deleted file mode 100644
index 983dfccda..000000000
--- a/thrust/system/cuda/detail/detail/stable_primitive_sort.inl
+++ /dev/null
@@ -1,248 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/detail/stable_primitive_sort.h>
-#include <thrust/system/cuda/detail/detail/stable_radix_sort.h>
-#include <thrust/functional.h>
-#include <thrust/partition.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/detail/type_traits.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace stable_primitive_sort_detail
-{
-
-
-template<typename Iterator>
-  struct enable_if_bool_sort
-    : thrust::detail::enable_if<
-        thrust::detail::is_same<
-          bool,
-          typename thrust::iterator_value<Iterator>::type
-        >::value
-      >
-{};
-
-
-template<typename Iterator>
-  struct disable_if_bool_sort
-    : thrust::detail::disable_if<
-        thrust::detail::is_same<
-          bool,
-          typename thrust::iterator_value<Iterator>::type
-        >::value
-      >
-{};
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-typename enable_if_bool_sort<RandomAccessIterator>::type
-  stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                        RandomAccessIterator first,
-                        RandomAccessIterator last,
-                        thrust::less<typename thrust::iterator_value<RandomAccessIterator>::type>)
-{
-  // use stable_partition if we're sorting bool
-  // stable_partition puts true values first, so we need to logical_not
-  thrust::stable_partition(exec, first, last, thrust::logical_not<bool>());
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-typename enable_if_bool_sort<RandomAccessIterator>::type
-  stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                        RandomAccessIterator first,
-                        RandomAccessIterator last,
-                        thrust::greater<typename thrust::iterator_value<RandomAccessIterator>::type>)
-{
-  // use stable_partition if we're sorting bool
-  // stable_partition puts true values first, so we don't need to logical_not
-  thrust::stable_partition(exec, first, last, thrust::identity<bool>());
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename Compare>
-__host__ __device__
-typename disable_if_bool_sort<RandomAccessIterator>::type
-  stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                        RandomAccessIterator first,
-                        RandomAccessIterator last,
-                        Compare comp)
-{
-  // call stable_radix_sort
-  thrust::system::cuda::detail::detail::stable_radix_sort(exec,first,last,comp);
-}
-
-
-struct logical_not_first
-{
-  template<typename Tuple>
-  __host__ __device__
-  bool operator()(Tuple t)
-  {
-    return !thrust::get<0>(t);
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-typename enable_if_bool_sort<RandomAccessIterator1>::type
-  stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                               RandomAccessIterator1 keys_first,
-                               RandomAccessIterator1 keys_last,
-                               RandomAccessIterator2 values_first,
-                               thrust::less<typename thrust::iterator_value<RandomAccessIterator1>::type>)
-{
-  // use stable_partition if we're sorting bool
-  // stable_partition puts true values first, so we need to logical_not
-  thrust::stable_partition(exec,
-                           thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
-                           thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)),
-                           logical_not_first());
-}
-
-
-struct first_tuple_element
-{
-  template<typename Tuple>
-  __host__ __device__
-  bool operator()(Tuple t)
-  {
-    return thrust::get<0>(t);
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-typename enable_if_bool_sort<RandomAccessIterator1>::type
-  stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                               RandomAccessIterator1 keys_first,
-                               RandomAccessIterator1 keys_last,
-                               RandomAccessIterator2 values_first,
-                               thrust::greater<typename thrust::iterator_value<RandomAccessIterator1>::type>)
-{
-  // use stable_partition if we're sorting bool
-  // stable_partition puts true values first, so we need to just return the first tuple element
-  // i.e., we don't need to use logical_not_first
-  thrust::stable_partition(exec,
-                           thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
-                           thrust::make_zip_iterator(thrust::make_tuple(keys_last, values_first)),
-                           first_tuple_element());
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename Compare>
-__host__ __device__
-typename disable_if_bool_sort<RandomAccessIterator1>::type
-  stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                               RandomAccessIterator1 keys_first,
-                               RandomAccessIterator1 keys_last,
-                               RandomAccessIterator2 values_first,
-                               Compare comp)
-{
-  // call stable_radix_sort_by_key
-  thrust::system::cuda::detail::detail::stable_radix_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-}
-  
-
-} // end stable_primitive_sort_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                           RandomAccessIterator first,
-                           RandomAccessIterator last,
-                           thrust::less<typename thrust::iterator_value<RandomAccessIterator>::type> comp)
-{
-  thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort(exec,first,last, comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_primitive_sort(execution_policy<DerivedPolicy> &exec,
-                           RandomAccessIterator first,
-                           RandomAccessIterator last,
-                           thrust::greater<typename thrust::iterator_value<RandomAccessIterator>::type> comp)
-{
-  thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort(exec,first,last, comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                                  RandomAccessIterator1 keys_first,
-                                  RandomAccessIterator1 keys_last,
-                                  RandomAccessIterator2 values_first,
-                                  thrust::less<typename thrust::iterator_value<RandomAccessIterator1>::type> comp)
-{
-  thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_primitive_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                                  RandomAccessIterator1 keys_first,
-                                  RandomAccessIterator1 keys_last,
-                                  RandomAccessIterator2 values_first,
-                                  thrust::greater<typename thrust::iterator_value<RandomAccessIterator1>::type> comp)
-{
-  thrust::system::cuda::detail::detail::stable_primitive_sort_detail::stable_primitive_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-}
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/detail/stable_radix_sort.h b/thrust/system/cuda/detail/detail/stable_radix_sort.h
deleted file mode 100644
index 01b78c066..000000000
--- a/thrust/system/cuda/detail/detail/stable_radix_sort.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file stable_radix_sort_dev.h
- *  \brief Defines the interface for a stable radix sort implementation on CUDA
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/functional.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_radix_sort(execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       thrust::less<typename thrust::iterator_value<RandomAccessIterator>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_radix_sort(execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       thrust::greater<typename thrust::iterator_value<RandomAccessIterator>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_radix_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator1 keys_last,
-                              RandomAccessIterator2 values_first,
-                              thrust::less<typename thrust::iterator_value<RandomAccessIterator1>::type>);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_radix_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator1 keys_last,
-                              RandomAccessIterator2 values_first,
-                              thrust::greater<typename thrust::iterator_value<RandomAccessIterator1>::type>);
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/detail/stable_radix_sort.inl>
-
diff --git a/thrust/system/cuda/detail/detail/stable_radix_sort.inl b/thrust/system/cuda/detail/detail/stable_radix_sort.inl
deleted file mode 100644
index e3fb34c7d..000000000
--- a/thrust/system/cuda/detail/detail/stable_radix_sort.inl
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-
-// do not attempt to compile this file with any other compiler
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
-#include <thrust/detail/copy.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/system/cuda/detail/cub.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/tuple.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace stable_radix_sort_detail
-{
-
-
-// sort ascending
-template<typename Key>
-__host__ __device__
-cudaError_t cub_sort_keys_wrapper(void *d_temp_storage,
-                                  size_t &temp_storage_bytes,
-                                  cub_::DoubleBuffer<Key> &d_keys,
-                                  int num_items,
-                                  thrust::less<Key> comp,
-                                  int begin_bit = 0,
-                                  int end_bit = sizeof(Key) * 8,
-                                  cudaStream_t stream = 0,
-                                  bool debug_synchronous = false)
-{
-  struct workaround
-  {
-    __host__ 
-    static cudaError_t host_path(void *d_temp_storage,
-                                 size_t &temp_storage_bytes,
-                                 cub_::DoubleBuffer<Key> &d_keys,
-                                 int num_items,
-                                 thrust::less<Key>,
-                                 int begin_bit,
-                                 int end_bit,
-                                 cudaStream_t stream,
-                                 bool debug_synchronous)
-    {
-      return cub_::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream, debug_synchronous);
-    }
-
-    __device__
-    static cudaError_t device_path(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   int num_items,
-                                   thrust::less<Key>,
-                                   int begin_bit,
-                                   int end_bit,
-                                   cudaStream_t stream,
-                                   bool debug_synchronous)
-    {
-#if __BULK_HAS_CUDART__
-      return cub_::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream, debug_synchronous);
-#else
-      return cudaErrorNotSupported;
-#endif
-    }
-  };
-
-#ifndef __CUDA_ARCH__
-  return workaround::host_path(d_temp_storage, temp_storage_bytes, d_keys, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#else
-  return workaround::device_path(d_temp_storage, temp_storage_bytes, d_keys, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#endif
-}
-
-
-// sort descending
-template<typename Key>
-__host__ __device__
-cudaError_t cub_sort_keys_wrapper(void *d_temp_storage,
-                                  size_t &temp_storage_bytes,
-                                  cub_::DoubleBuffer<Key> &d_keys,
-                                  int num_items,
-                                  thrust::greater<Key> comp,
-                                  int begin_bit = 0,
-                                  int end_bit = sizeof(Key) * 8,
-                                  cudaStream_t stream = 0,
-                                  bool debug_synchronous = false)
-{
-  struct workaround
-  {
-    __host__ 
-    static cudaError_t host_path(void *d_temp_storage,
-                                 size_t &temp_storage_bytes,
-                                 cub_::DoubleBuffer<Key> &d_keys,
-                                 int num_items,
-                                 thrust::greater<Key>,
-                                 int begin_bit,
-                                 int end_bit,
-                                 cudaStream_t stream,
-                                 bool debug_synchronous)
-    {
-      return cub_::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream, debug_synchronous);
-    }
-
-    __device__
-    static cudaError_t device_path(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   int num_items,
-                                   thrust::greater<Key>,
-                                   int begin_bit,
-                                   int end_bit,
-                                   cudaStream_t stream,
-                                   bool debug_synchronous)
-    {
-#if __BULK_HAS_CUDART__
-      return cub_::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items, begin_bit, end_bit, stream, debug_synchronous);
-#else
-      return cudaErrorNotSupported;
-#endif
-    }
-  };
-
-#ifndef __CUDA_ARCH__
-  return workaround::host_path(d_temp_storage, temp_storage_bytes, d_keys, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#else
-  return workaround::device_path(d_temp_storage, temp_storage_bytes, d_keys, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#endif
-}
-
-
-// returns 1. the total size of temporary storage required for a key sort
-//         2. an offset to the "d_temp_storage" parameter for CUB's sort
-//         3. the value of the "temp_storage_bytes" parameter for CUB's sort
-template<typename T, typename Compare>
-__host__ __device__
-thrust::tuple<size_t, size_t, size_t> compute_temporary_storage_requirements_for_radix_sort_n(size_t n, Compare comp, cudaStream_t stream)
-{
-  cub_::DoubleBuffer<T> dummy;
-
-  // measure the number of additional temporary storage bytes required
-  size_t num_additional_temp_storage_bytes = 0;
-  thrust::system::cuda::detail::throw_on_error(cub_sort_keys_wrapper(0, num_additional_temp_storage_bytes, dummy, static_cast<int>(n), comp, 0, sizeof(T)*8, stream),
-                                               "after cub_::DeviceRadixSort::SortKeys(0)");
-
-  // XXX the additional temporary storage bytes
-  //     must be allocated on a 16b aligned address
-  typedef typename bulk_::detail::aligned_type<16>::type aligned_type;
-
-  size_t num_double_buffer_bytes = n * sizeof(T);
-  size_t num_aligned_double_buffer_bytes = thrust::detail::util::round_i(num_double_buffer_bytes, sizeof(aligned_type));
-  size_t num_aligned_total_temporary_storage_bytes = num_aligned_double_buffer_bytes + num_additional_temp_storage_bytes;
-
-  return thrust::make_tuple(num_aligned_total_temporary_storage_bytes, num_aligned_double_buffer_bytes, num_additional_temp_storage_bytes);
-}
-
-
-template<typename DerivedPolicy, typename T, typename Compare>
-__host__ __device__
-void stable_radix_sort_n(execution_policy<DerivedPolicy> &exec, T* first, size_t n, Compare comp)
-{
-  if(n > 1)
-  {
-    cudaStream_t s = stream(thrust::detail::derived_cast<DerivedPolicy>(exec));
-
-    // compute temporary storage requirements
-    size_t num_temporary_storage_bytes = 0;
-    size_t offset_to_additional_temp_storage = 0;
-    size_t num_additional_temp_storage_bytes = 0;
-    thrust::tie(num_temporary_storage_bytes, offset_to_additional_temp_storage, num_additional_temp_storage_bytes) =
-      compute_temporary_storage_requirements_for_radix_sort_n<T>(n, comp, s);
-
-    // allocate storage
-    thrust::detail::temporary_array<char,DerivedPolicy> temporary_storage(exec, num_temporary_storage_bytes);
-
-    // set up double buffer
-    cub_::DoubleBuffer<T> double_buffer;
-    double_buffer.d_buffers[0] = thrust::raw_pointer_cast(&*first);
-    double_buffer.d_buffers[1] = reinterpret_cast<T*>(reinterpret_cast<void*>(thrust::raw_pointer_cast(&temporary_storage[0])));
-
-    thrust::system::cuda::detail::throw_on_error(cub_sort_keys_wrapper(thrust::raw_pointer_cast(&temporary_storage[offset_to_additional_temp_storage]),
-                                                                       num_additional_temp_storage_bytes,
-                                                                       double_buffer,
-                                                                       static_cast<int>(n),
-                                                                       comp,
-                                                                       0,
-                                                                       sizeof(T)*8,
-                                                                       s),
-                                                 "after cub_::DeviceRadixSort::SortKeys(1)");
-
-    thrust::system::cuda::detail::synchronize_if_enabled("stable_radix_sort_n(): after cub_::DeviceRadixSort::SortKeys(1)");
-
-    if(double_buffer.selector != 0)
-    {
-      T* temp_ptr = reinterpret_cast<T*>(double_buffer.d_buffers[1]);
-      thrust::copy(exec, temp_ptr, temp_ptr + n, first);
-    }
-  }
-}
-
-
-} // end namespace stable_radix_sort_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_radix_sort(execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       thrust::less<typename thrust::iterator_value<RandomAccessIterator>::type> comp)
-{
-  stable_radix_sort_detail::stable_radix_sort_n(exec, thrust::raw_pointer_cast(&*first), last - first, comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator>
-__host__ __device__
-void stable_radix_sort(execution_policy<DerivedPolicy> &exec,
-                       RandomAccessIterator first,
-                       RandomAccessIterator last,
-                       thrust::greater<typename thrust::iterator_value<RandomAccessIterator>::type> comp)
-{
-  stable_radix_sort_detail::stable_radix_sort_n(exec, thrust::raw_pointer_cast(&*first), last - first, comp);
-}
-
-
-///////////////////////
-// Key-Value Sorting //
-///////////////////////
-
-
-namespace stable_radix_sort_detail
-{
-
-
-// sort ascending
-template<typename Key, typename Value>
-__host__ __device__
-cudaError_t cub_sort_pairs_wrapper(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   cub_::DoubleBuffer<Value> &d_values,
-                                   int num_items,
-                                   thrust::less<Key> comp,
-                                   int begin_bit = 0,
-                                   int end_bit = sizeof(Key) * 8,
-                                   cudaStream_t stream = 0,
-                                   bool debug_synchronous = false)
-{
-  struct workaround
-  {
-    __host__ 
-    static cudaError_t host_path(void *d_temp_storage,
-                                 size_t &temp_storage_bytes,
-                                 cub_::DoubleBuffer<Key> &d_keys,
-                                 cub_::DoubleBuffer<Value> &d_values,
-                                 int num_items,
-                                 thrust::less<Key>,
-                                 int begin_bit,
-                                 int end_bit,
-                                 cudaStream_t stream,
-                                 bool debug_synchronous)
-    {
-      return cub_::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, debug_synchronous);
-    }
-
-    __device__
-    static cudaError_t device_path(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   cub_::DoubleBuffer<Value> &d_values,
-                                   int num_items,
-                                   thrust::less<Key>,
-                                   int begin_bit,
-                                   int end_bit,
-                                   cudaStream_t stream,
-                                   bool debug_synchronous)
-    {
-#if __BULK_HAS_CUDART__
-      return cub_::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, debug_synchronous);
-#else
-      return cudaErrorNotSupported;
-#endif
-    }
-  };
-
-#ifndef __CUDA_ARCH__
-  return workaround::host_path(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#else
-  return workaround::device_path(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#endif
-}
-
-
-// sort descending
-template<typename Key, typename Value>
-__host__ __device__
-cudaError_t cub_sort_pairs_wrapper(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   cub_::DoubleBuffer<Value> &d_values,
-                                   int num_items,
-                                   thrust::greater<Key> comp,
-                                   int begin_bit = 0,
-                                   int end_bit = sizeof(Key) * 8,
-                                   cudaStream_t stream = 0,
-                                   bool debug_synchronous = false)
-{
-  struct workaround
-  {
-    __host__ 
-    static cudaError_t host_path(void *d_temp_storage,
-                                 size_t &temp_storage_bytes,
-                                 cub_::DoubleBuffer<Key> &d_keys,
-                                 cub_::DoubleBuffer<Value> &d_values,
-                                 int num_items,
-                                 thrust::greater<Key>,
-                                 int begin_bit,
-                                 int end_bit,
-                                 cudaStream_t stream,
-                                 bool debug_synchronous)
-    {
-      return cub_::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, debug_synchronous);
-    }
-
-    __device__
-    static cudaError_t device_path(void *d_temp_storage,
-                                   size_t &temp_storage_bytes,
-                                   cub_::DoubleBuffer<Key> &d_keys,
-                                   cub_::DoubleBuffer<Value> &d_values,
-                                   int num_items,
-                                   thrust::greater<Key>,
-                                   int begin_bit,
-                                   int end_bit,
-                                   cudaStream_t stream,
-                                   bool debug_synchronous)
-    {
-#if __BULK_HAS_CUDART__
-      return cub_::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, begin_bit, end_bit, stream, debug_synchronous);
-#else
-      return cudaErrorNotSupported;
-#endif
-    }
-  };
-
-#ifndef __CUDA_ARCH__
-  return workaround::host_path(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#else
-  return workaround::device_path(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items, comp, begin_bit, end_bit, stream, debug_synchronous);
-#endif
-}
-
-
-// returns 1. the total size of temporary storage required for a key sort
-//         2. an offset to the double buffer for values
-//         3. an offset to the "d_temp_storage" parameter for CUB's sort
-//         4. the value of the "temp_storage_bytes" parameter for CUB's sort
-template<typename Key, typename Value, typename Compare>
-__host__ __device__
-thrust::tuple<size_t, size_t, size_t, size_t> compute_temporary_storage_requirements_for_radix_sort_by_key_n(size_t n, Compare comp, cudaStream_t stream)
-{
-  cub_::DoubleBuffer<Key> dummy_keys;
-  cub_::DoubleBuffer<Value> dummy_values;
-
-  // measure the number of additional temporary storage bytes required
-  size_t num_additional_temp_storage_bytes = 0;
-  thrust::system::cuda::detail::throw_on_error(cub_sort_pairs_wrapper(0, num_additional_temp_storage_bytes, dummy_keys, dummy_values, static_cast<int>(n), comp, 0, sizeof(Key)*8, stream),
-                                               "after cub_::DeviceRadixSort::SortPairs(0)");
-
-  // XXX the additional temporary storage bytes
-  //     must be allocated on a 16b aligned address
-  typedef typename bulk_::detail::aligned_type<16>::type aligned_type;
-
-  size_t num_keys_double_buffer_bytes = n * sizeof(Key);
-
-  // align up the allocation for the keys double buffer
-  size_t num_aligned_keys_double_buffer_bytes = thrust::detail::util::round_i(num_keys_double_buffer_bytes, sizeof(aligned_type));
-
-  size_t num_values_double_buffer_bytes = n * sizeof(Value);
-
-  // align up the allocation for both double buffers
-  size_t num_aligned_double_buffer_bytes = thrust::detail::util::round_i(num_aligned_keys_double_buffer_bytes + num_values_double_buffer_bytes, sizeof(aligned_type));
-
-  size_t num_aligned_total_temporary_storage_bytes = num_aligned_double_buffer_bytes + num_additional_temp_storage_bytes;
-
-  return thrust::make_tuple(num_aligned_total_temporary_storage_bytes, num_aligned_keys_double_buffer_bytes, num_aligned_double_buffer_bytes, num_additional_temp_storage_bytes);
-}
-
-
-// sort values directly
-template<typename DerivedPolicy,
-         typename Key,
-         typename Value,
-         typename Compare>
-__host__ __device__
-void stable_radix_sort_by_key_n(execution_policy<DerivedPolicy> &exec,
-                                Key* first1,
-                                size_t n,
-                                Value* first2,
-                                Compare comp)
-{
-  if(n > 1)
-  {
-    cudaStream_t s = stream(thrust::detail::derived_cast<DerivedPolicy>(exec));
-
-    // compute temporary storage requirements
-    size_t num_temporary_storage_bytes = 0;
-    size_t offset_to_values_buffer = 0;
-    size_t offset_to_additional_temp_storage = 0;
-    size_t num_additional_temp_storage_bytes = 0;
-    thrust::tie(num_temporary_storage_bytes, offset_to_values_buffer, offset_to_additional_temp_storage, num_additional_temp_storage_bytes) =
-      compute_temporary_storage_requirements_for_radix_sort_by_key_n<Key,Value>(n, comp, s);
-
-    // allocate storage
-    thrust::detail::temporary_array<char,DerivedPolicy> temporary_storage(exec, num_temporary_storage_bytes);
-
-    // set up double buffers
-    cub_::DoubleBuffer<Key> double_buffer_keys;
-    double_buffer_keys.d_buffers[0] = thrust::raw_pointer_cast(&*first1);
-    double_buffer_keys.d_buffers[1] = reinterpret_cast<Key*>(reinterpret_cast<void*>(thrust::raw_pointer_cast(&temporary_storage[0])));
-
-    cub_::DoubleBuffer<Value> double_buffer_values;
-    double_buffer_values.d_buffers[0] = thrust::raw_pointer_cast(&*first2);
-    double_buffer_values.d_buffers[1] = reinterpret_cast<Value*>(reinterpret_cast<void*>(thrust::raw_pointer_cast(&temporary_storage[offset_to_values_buffer])));
-
-    thrust::system::cuda::detail::throw_on_error(cub_sort_pairs_wrapper(thrust::raw_pointer_cast(&temporary_storage[offset_to_additional_temp_storage]),
-                                                                        num_additional_temp_storage_bytes,
-                                                                        double_buffer_keys,
-                                                                        double_buffer_values,
-                                                                        static_cast<int>(n),
-                                                                        comp,
-                                                                        0,
-                                                                        sizeof(Key)*8,
-                                                                        s),
-                                                 "after cub_::DeviceRadixSort::SortPairs(1)");
-
-    thrust::system::cuda::detail::synchronize_if_enabled("stable_radix_sort_by_key_n(): after cub_::DeviceRadixSort::SortPairs(1)");
-
-    if(double_buffer_keys.selector != 0)
-    {
-      Key* temp_ptr = reinterpret_cast<Key*>(double_buffer_keys.d_buffers[1]);
-      thrust::copy(exec, temp_ptr, temp_ptr + n, first1);
-    }
-
-    if(double_buffer_values.selector != 0)
-    {
-      Value* temp_ptr = reinterpret_cast<Value*>(double_buffer_values.d_buffers[1]);
-      thrust::copy(exec, temp_ptr, temp_ptr + n, first2);
-    }
-  }
-}
-
-
-} // end stable_radix_sort_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_radix_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 first1,
-                              RandomAccessIterator1 last1,
-                              RandomAccessIterator2 first2,
-                              thrust::less<typename thrust::iterator_value<RandomAccessIterator1>::type> comp)
-{
-  stable_radix_sort_detail::stable_radix_sort_by_key_n(exec,
-                                                       thrust::raw_pointer_cast(&*first1),
-                                                       last1 - first1,
-                                                       thrust::raw_pointer_cast(&*first2),
-                                                       comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2>
-__host__ __device__
-void stable_radix_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 first1,
-                              RandomAccessIterator1 last1,
-                              RandomAccessIterator2 first2,
-                              thrust::greater<typename thrust::iterator_value<RandomAccessIterator1>::type> comp)
-{
-  stable_radix_sort_detail::stable_radix_sort_by_key_n(exec,
-                                                       thrust::raw_pointer_cast(&*first1),
-                                                       last1 - first1,
-                                                       thrust::raw_pointer_cast(&*first2),
-                                                       comp);
-}
-
-
-} // end namespace detail
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-
-#endif // THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-
diff --git a/thrust/system/cuda/detail/detail/stable_sort_each.h b/thrust/system/cuda/detail/detail/stable_sort_each.h
deleted file mode 100644
index 9ebc39c88..000000000
--- a/thrust/system/cuda/detail/detail/stable_sort_each.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<unsigned int work_per_thread,
-         typename DerivedPolicy,
-         typename Context,
-         typename RandomAccessIterator1,
-         typename Pointer,
-         typename RandomAccessIterator2,
-         typename Compare>
-__host__ __device__
-void stable_sort_each_copy(execution_policy<DerivedPolicy> &exec,
-                           Context context,
-                           unsigned int block_size,
-                           RandomAccessIterator1 first, RandomAccessIterator1 last,
-                           Pointer vitual_smem,
-                           RandomAccessIterator2 result,
-                           Compare comp);
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/detail/stable_sort_each.inl>
-
diff --git a/thrust/system/cuda/detail/detail/stable_sort_each.inl b/thrust/system/cuda/detail/detail/stable_sort_each.inl
deleted file mode 100644
index 44d61e424..000000000
--- a/thrust/system/cuda/detail/detail/stable_sort_each.inl
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/detail/stable_sort_each.h>
-#include <thrust/system/cuda/detail/block/copy.h>
-#include <thrust/system/cuda/detail/detail/merge.h>
-#include <thrust/system/cuda/detail/extern_shared_ptr.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/detail/swap.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/detail/integer_math.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/detail/virtualized_smem_closure.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-namespace stable_sort_each_detail
-{
-namespace static_stable_odd_even_transpose_sort_detail
-{
-
-
-template<int i, int n>
-struct impl
-{
-  template<typename Iterator, typename Compare>
-  static __device__
-  void do_it(Iterator keys, Compare comp)
-  {
-    for(int j = 1 & i; j < n - 1; j += 2)
-    {
-      if(comp(keys[j + 1], keys[j]))
-      {
-        using thrust::swap;
-
-      	swap(keys[j], keys[j + 1]);
-      }
-    }
-
-    impl<i + 1, n>::do_it(keys, comp);
-  }
-};
-
-
-template<int i>
-struct impl<i,i>
-{
-  template<typename Iterator, typename Compare>
-  static __device__
-  void do_it(Iterator, Compare) {}
-};
-
-
-} // end static_stable_odd_even_transpose_sort_detail
-
-
-template<int n, typename RandomAccessIterator, typename Compare>
-__device__
-void static_stable_sort(RandomAccessIterator keys, Compare comp)
-{
-  static_stable_odd_even_transpose_sort_detail::impl<0,n>::do_it(keys, comp);
-}
-
-
-// sequential copy_n for when we have a static bound on the value of n
-template<unsigned int bound_n, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2>
-__device__
-void bounded_copy_n(RandomAccessIterator1 first, Size n, RandomAccessIterator2 result)
-{
-  for(unsigned int i = 0; i < bound_n; ++i)
-  {
-    if(i < n)
-    {
-      result[i] = first[i];
-    }
-  }
-}
-
-
-namespace block
-{
-
-
-template<unsigned int work_per_thread, typename Context, typename Iterator, typename Size, typename Compare>
-__device__
-void bounded_inplace_merge_adjacent_partitions(Context &ctx,
-                                               Iterator first,
-                                               Size n,
-                                               Compare comp)
-{
-  typedef typename thrust::iterator_value<Iterator>::type value_type;
-
-  for(Size num_threads_per_merge = 2; num_threads_per_merge <= ctx.block_dimension(); num_threads_per_merge *= 2)
-  {
-    // find the index of the first array this thread will merge
-    Size list = ~(num_threads_per_merge - 1) & ctx.thread_index();
-    Size diag = thrust::min<Size>(n, work_per_thread * ((num_threads_per_merge - 1) & ctx.thread_index()));
-    Size input_start = work_per_thread * list;
-
-    // the size of each of the two input arrays we're merging
-    Size input_size = work_per_thread * (num_threads_per_merge / 2);
-
-    // find the limits of the partitions of the input this group of threads will merge
-    Size partition_first1 = thrust::min<Size>(n, input_start);
-    Size partition_first2 = thrust::min<Size>(n, partition_first1 + input_size); 
-    Size partition_last2  = thrust::min<Size>(n, partition_first2 + input_size);
-
-    Size n1 = partition_first2 - partition_first1;
-    Size n2 = partition_last2  - partition_first2;
-
-    Size mp = merge_path(diag, first + partition_first1, n1, first + partition_first2, n2, comp);
-
-    // each thread merges sequentially locally
-    value_type local_result[work_per_thread];
-    sequential_bounded_merge<work_per_thread>(first + partition_first1 + mp,        first + partition_first2,
-                                              first + partition_first2 + diag - mp, first + partition_last2,
-                                              local_result,
-                                              comp);
-
-    ctx.barrier();
-
-    // compute the size of the local result to account for the final, partial tile
-    Size local_result_size = thrust::min<Size>(work_per_thread, n - (ctx.thread_index() * work_per_thread));
-
-    // store local results
-    bounded_copy_n<work_per_thread>(local_result, local_result_size, first + ctx.thread_index() * work_per_thread);
-
-    ctx.barrier();
-  }
-}
-
-
-template<unsigned int work_per_thread, typename Context, typename RandomAccessIterator, typename Size, typename Compare>
-__device__
-void bounded_stable_sort(Context &ctx,
-                         RandomAccessIterator first,
-                         Size n,
-                         Compare comp)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-
-  // compute the size of this thread's local tile to account for the final, partial tile
-  Size local_tile_size = work_per_thread;
-  if(work_per_thread * (ctx.thread_index() + 1) > n)
-  {
-    local_tile_size = thrust::max<Size>(0, n - (work_per_thread * ctx.thread_index()));
-  }
-
-  // each thread creates a local copy of its partition of the array
-  value_type local_keys[work_per_thread];
-  bounded_copy_n<work_per_thread>(first + ctx.thread_index() * work_per_thread, local_tile_size, local_keys);
-  
-  // if we're in the final partial tile, fill the remainder of the local_keys with with the max value
-  if(local_tile_size < work_per_thread)
-  {
-    value_type max_key = local_keys[0];
-
-    for(unsigned int i = 1; i < work_per_thread; ++i)
-    {
-      if(i < local_tile_size)
-      {
-        max_key = comp(max_key, local_keys[i]) ? local_keys[i] : max_key;
-      }
-    }
-    
-    // fill in the remainder with max_key
-    for(unsigned int i = 0; i < work_per_thread; ++i)
-    {
-      if(i >= local_tile_size)
-      {
-        local_keys[i] = max_key;
-      }
-    }
-  }
-
-  // stable sort the keys in the thread.
-  if(work_per_thread * ctx.thread_index() < n)
-  {
-    static_stable_sort<work_per_thread>(local_keys, comp);
-  }
-  
-  // Store the locally sorted keys into shared memory.
-  bounded_copy_n<work_per_thread>(local_keys, local_tile_size, first + ctx.thread_index() * work_per_thread);
-  ctx.barrier();
-
-  block::bounded_inplace_merge_adjacent_partitions<work_per_thread>(ctx, first, n, comp);
-}
-
-
-} // end block
-
-
-template<unsigned int work_per_thread,
-         typename Context,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2,
-         typename Compare>
-struct stable_sort_each_copy_closure
-{
-  typedef Context context_type;
-
-  RandomAccessIterator1 first;
-  Size n;
-  RandomAccessIterator2 result;
-  thrust::detail::wrapped_function<Compare,bool> comp;
-
-  __host__ __device__
-  stable_sort_each_copy_closure(RandomAccessIterator1 first, Size n, RandomAccessIterator2 result, Compare comp)
-    : first(first),
-      n(n),
-      result(result),
-      comp(comp)
-  {}
-
-
-  template<typename RandomAccessIterator>
-  __device__ __thrust_forceinline__
-  void operator()(RandomAccessIterator staging_buffer)
-  {
-    context_type ctx;
-
-    unsigned int work_per_block = ctx.block_dimension() * work_per_thread;
-    unsigned int offset = work_per_block * ctx.block_index();
-    unsigned int tile_size = thrust::min<unsigned int>(work_per_block, n - offset);
-    
-    // load input tile into buffer
-    thrust::system::cuda::detail::block::copy_n_global_to_shared<work_per_thread>(ctx, first + offset, tile_size, staging_buffer);
-
-    // sort input in buffer
-    block::bounded_stable_sort<work_per_thread>(ctx, staging_buffer, tile_size, comp);
-    
-    // store result to gmem
-    thrust::system::cuda::detail::block::copy_n(ctx, staging_buffer, tile_size, result + offset);
-  }
-
-
-  __device__ __thrust_forceinline__
-  void operator()()
-  {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-    // stage this operation through smem
-    // the size of this array is block_size * (work_per_thread + 1)
-    value_type *s_keys = thrust::system::cuda::detail::extern_shared_ptr<value_type>();
-    
-    this->operator()(s_keys);
-  }
-};
-
-
-} // end namespace stable_sort_each_detail
-
-
-template<unsigned int work_per_thread,
-         typename DerivedPolicy,
-         typename Context,
-         typename RandomAccessIterator1,
-         typename Pointer,
-         typename RandomAccessIterator2,
-         typename Compare>
-__host__ __device__
-void stable_sort_each_copy(execution_policy<DerivedPolicy> &exec,
-                           Context context,
-                           unsigned int block_size,
-                           RandomAccessIterator1 first, RandomAccessIterator1 last,
-                           Pointer virtual_smem,
-                           RandomAccessIterator2 result,
-                           Compare comp)
-{
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference_type;
-
-  difference_type n = last - first;
-
-  int num_blocks = thrust::detail::util::divide_ri(n, block_size * work_per_thread);
-
-  typedef stable_sort_each_detail::stable_sort_each_copy_closure<
-    work_per_thread,
-    Context,
-    RandomAccessIterator1,
-    difference_type,
-    RandomAccessIterator2,
-    Compare
-  > closure_type;
-
-  closure_type closure(first, n, result, comp);
-  
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-  const size_t num_smem_elements_per_block = block_size * (work_per_thread + 1);
-
-  // XXX this virtualizing code can probably be generalized and moved elsewhere
-  if(virtual_smem)
-  {
-    virtualized_smem_closure<closure_type, Pointer> virtualized_closure(closure, num_smem_elements_per_block, virtual_smem);
-
-    thrust::system::cuda::detail::detail::launch_closure(exec, virtualized_closure, num_blocks, block_size);
-  }
-  else
-  {
-    const size_t num_smem_bytes = num_smem_elements_per_block * sizeof(value_type);
-
-    thrust::system::cuda::detail::detail::launch_closure(exec, closure, num_blocks, block_size, num_smem_bytes);
-  }
-}
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/uninitialized.h b/thrust/system/cuda/detail/detail/uninitialized.h
deleted file mode 100644
index 6d0806eb5..000000000
--- a/thrust/system/cuda/detail/detail/uninitialized.h
+++ /dev/null
@@ -1,296 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/detail/alignment.h>
-#include <cstddef>
-#include <new>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename T>
-  class uninitialized
-{
-  private:
-    typename aligned_storage<
-      sizeof(T),
-      alignment_of<T>::value
-    >::type storage;
-
-    __host__ __device__ __thrust_forceinline__
-    const T* ptr() const
-    {
-      return reinterpret_cast<const T*>(storage.data);
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    T* ptr()
-    {
-      return reinterpret_cast<T*>(storage.data);
-    }
-
-  public:
-    // copy assignment
-    __host__ __device__ __thrust_forceinline__
-    uninitialized<T> &operator=(const T &other)
-    {
-      T& self = *this;
-      self = other;
-      return *this;
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    T& get()
-    {
-      return *ptr();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    const T& get() const
-    {
-      return *ptr();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    operator T& ()
-    {
-      return get();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    operator const T&() const
-    {
-      return get();
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    void construct()
-    {
-      ::new(ptr()) T();
-    }
-
-    template<typename Arg>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg &a)
-    {
-      ::new(ptr()) T(a);
-    }
-
-    template<typename Arg1, typename Arg2>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2)
-    {
-      ::new(ptr()) T(a1,a2);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3)
-    {
-      ::new(ptr()) T(a1,a2,a3);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9);
-    }
-
-    template<typename Arg1, typename Arg2, typename Arg3, typename Arg4, typename Arg5, typename Arg6, typename Arg7, typename Arg8, typename Arg9, typename Arg10>
-    __host__ __device__ __thrust_forceinline__
-    void construct(const Arg1 &a1, const Arg2 &a2, const Arg3 &a3, const Arg4 &a4, const Arg5 &a5, const Arg6 &a6, const Arg7 &a7, const Arg8 &a8, const Arg9 &a9, const Arg10 &a10)
-    {
-      ::new(ptr()) T(a1,a2,a3,a4,a5,a6,a7,a8,a9,a10);
-    }
-
-    __host__ __device__ __thrust_forceinline__
-    void destroy()
-    {
-      T& self = *this;
-      self.~T();
-    }
-};
-
-
-template<typename T, std::size_t N>
-  class uninitialized_array
-{
-  public:
-    typedef T             value_type; 
-    typedef T&            reference;
-    typedef const T&      const_reference;
-    typedef T*            pointer;
-    typedef const T*      const_pointer;
-    typedef pointer       iterator;
-    typedef const_pointer const_iterator;
-    typedef std::size_t   size_type;
-
-    __thrust_forceinline__ __host__ __device__
-    iterator begin()
-    {
-      return data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator begin() const
-    {
-      return data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    iterator end()
-    {
-      return begin() + size();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator end() const
-    {
-      return begin() + size();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator cbegin() const
-    {
-      return begin();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_iterator cend() const
-    {
-      return end();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    size_type size() const
-    {
-      return N;
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    bool empty() const
-    {
-      return false;
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    T* data()
-    {
-      return impl.get();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const T* data() const
-    {
-      return impl.get();
-    }
-
-    // element access
-    __thrust_forceinline__ __host__ __device__
-    reference operator[](size_type n)
-    {
-      return data()[n];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference operator[](size_type n) const
-    {
-      return data()[n];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    reference front()
-    {
-      return *data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference front() const
-    {
-      return *data();
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    reference back()
-    {
-      return data()[size() - size_type(1)];
-    }
-
-    __thrust_forceinline__ __host__ __device__
-    const_reference back() const
-    {
-      return data()[size() - size_type(1)];
-    }
-
-  private:
-    uninitialized<T[N]> impl;
-};
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/detail/virtualized_smem_closure.h b/thrust/system/cuda/detail/detail/virtualized_smem_closure.h
deleted file mode 100644
index 185fd5c11..000000000
--- a/thrust/system/cuda/detail/detail/virtualized_smem_closure.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace detail
-{
-
-
-template<typename Closure, typename RandomAccessIterator>
-  struct virtualized_smem_closure
-    : Closure
-{
-  typedef Closure super_t;
-
-  size_t num_elements_per_block;
-  RandomAccessIterator virtual_smem;
-
-  __host__ __device__ __thrust_forceinline__
-  virtualized_smem_closure(Closure closure, size_t num_elements_per_block, RandomAccessIterator virtual_smem)
-    : super_t(closure),
-      num_elements_per_block(num_elements_per_block),
-      virtual_smem(virtual_smem)
-  {}
-
-  __device__ __thrust_forceinline__
-  void operator()()
-  {
-    typename super_t::context_type ctx;
-
-    RandomAccessIterator smem = virtual_smem + num_elements_per_block * ctx.block_index();
-
-    super_t::operator()(smem);
-  }
-};
-
-
-} // end detail
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h
new file mode 100644
index 000000000..d0e3f94ec
--- /dev/null
+++ b/thrust/system/cuda/detail/dispatch.h
@@ -0,0 +1,79 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/preprocessor.h>
+#include <thrust/detail/integer_traits.h>
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH(status, call, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int32_t>(count); \
+        status = call arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+        status = call arguments; \
+    }
+
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version assumes that callables for both branches consist
+ * of the same tokens, and is intended to be used with Thrust-style dispatch
+ * interfaces, that always deduce the size type from the arguments.
+ *
+ * This version of the macro supports providing two count variables, which is
+ * necessary for set algorithms.
+ */
+#define THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, call, count1, count2, arguments) \
+    if (count1 + count2 <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int32_t>(count1); \
+        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int32_t>(count2); \
+        status = call arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
+        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
+        status = call arguments; \
+    }
+/**
+ * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
+ * implementation. This version allows using different token sequences for callables
+ * in both branches, and is intended to be used with CUB-style dispatch interfaces,
+ * where the "simple" interface always forces the size to be `int` (making it harder
+ * for us to use), but the complex interface that we end up using doesn't actually
+ * provide a way to fully deduce the type from just the call, making the size type
+ * appear in the token sequence of the callable.
+ *
+ * See reduce_n_impl to see an example of how this is meant to be used.
+ */
+#define THRUST_INDEX_TYPE_DISPATCH2(status, call_32, call_64, count, arguments) \
+    if (count <= thrust::detail::integer_traits<thrust::detail::int32_t>::const_max) { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int32_t>(count); \
+        status = call_32 arguments; \
+    } \
+    else { \
+        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+        status = call_64 arguments; \
+    }
+
diff --git a/thrust/system/cuda/detail/equal.h b/thrust/system/cuda/detail/equal.h
index c6ae90664..aec608245 100644
--- a/thrust/system/cuda/detail/equal.h
+++ b/thrust/system/cuda/detail/equal.h
@@ -1,22 +1,74 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/mismatch.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+bool __host__ __device__
+equal(execution_policy<Derived>& policy,
+      InputIt1                   first1,
+      InputIt1                   last1,
+      InputIt2                   first2,
+      BinaryPred                 binary_pred)
+{
+  return cuda_cub::mismatch(policy, first1, last1, first2, binary_pred).first == last1;
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+bool __host__ __device__
+equal(execution_policy<Derived>& policy,
+      InputIt1                   first1,
+      InputIt1                   last1,
+      InputIt2                   first2)
+{
+  typedef typename thrust::iterator_value<InputIt1>::type InputType1;
+  return cuda_cub::equal(policy,
+                         first1,
+                         last1,
+                         first2,
+                         equal_to<InputType1>());
+}
+
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/error.inl b/thrust/system/cuda/detail/error.inl
index fd4c679fe..e52305211 100644
--- a/thrust/system/cuda/detail/error.inl
+++ b/thrust/system/cuda/detail/error.inl
@@ -17,11 +17,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/cuda/error.h>
 #include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -39,7 +40,7 @@ error_condition make_error_condition(cuda::errc::errc_t e)
 } // end make_error_condition()
 
 
-namespace cuda
+namespace cuda_cub
 {
 
 namespace detail
@@ -59,16 +60,19 @@ class cuda_error_category
 
     inline virtual std::string message(int ev) const
     {
-      static const std::string unknown_err("Unknown error");
-      const char *c_str = ::cudaGetErrorString(static_cast<cudaError_t>(ev));
-      return c_str ? std::string(c_str) : unknown_err;
+      char const* const unknown_str  = "unknown error";
+      char const* const unknown_name = "cudaErrorUnknown";
+      char const* c_str  = ::cudaGetErrorString(static_cast<cudaError_t>(ev));
+      char const* c_name = ::cudaGetErrorName(static_cast<cudaError_t>(ev));
+      return std::string(c_name ? c_name : unknown_name)
+           + ": " + (c_str ? c_str : unknown_str);
     }
 
     inline virtual error_condition default_error_condition(int ev) const
     {
       using namespace cuda::errc;
 
-      if(ev < ::cudaErrorApiFailureBase)
+      if(ev < ::cudaErrorUnknown)
       {
         return make_error_condition(static_cast<errc_t>(ev));
       }
@@ -79,17 +83,17 @@ class cuda_error_category
 
 } // end detail
 
-} // end namespace cuda
+} // end namespace cuda_cub
 
 
 const error_category &cuda_category(void)
 {
-  static const cuda::detail::cuda_error_category result;
+  static const thrust::system::cuda_cub::detail::cuda_error_category result;
   return result;
 }
 
 
 } // end namespace system
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/execute_on_stream.h b/thrust/system/cuda/detail/execute_on_stream.h
deleted file mode 100644
index b97198174..000000000
--- a/thrust/system/cuda/detail/execute_on_stream.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- *  Copyright 2008-2012 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-__host__ __device__
-inline cudaStream_t legacy_stream()
-{
-#if (CUDA_VERSION < 7000)
-  return 0;
-#else
-  return cudaStreamLegacy;
-#endif
-}
-
-
-__host__ __device__
-inline cudaStream_t default_stream()
-{
-  // XXX we might actually want to use the per-thread default stream instead
-  return legacy_stream();
-}
-
-
-// given any old execution_policy, we return the default stream
-template<typename DerivedPolicy>
-__host__ __device__
-inline cudaStream_t stream(const execution_policy<DerivedPolicy> &exec)
-{
-  return default_stream();
-}
-
-
-// base class for execute_on_stream
-template<typename DerivedPolicy>
-class execute_on_stream_base
-  : public thrust::system::cuda::detail::execution_policy<DerivedPolicy>
-{
-  public:
-    __host__ __device__
-    execute_on_stream_base()
-      : m_stream(default_stream())
-    {}
-
-    __host__ __device__
-    execute_on_stream_base(cudaStream_t stream)
-      : m_stream(stream)
-    {}
-
-    __host__ __device__
-    DerivedPolicy on(const cudaStream_t &s) const
-    {
-      // create a copy of *this to return
-      // make sure it is the derived type
-      DerivedPolicy result = thrust::detail::derived_cast(*this);
-
-      // change the result's stream to s
-      result.set_stream(s);
-
-      return result;
-    }
-
-  private:
-    // stream() is a friend function because we call it through ADL
-    __host__ __device__
-    friend inline cudaStream_t stream(const execute_on_stream_base &exec)
-    {
-      return exec.m_stream;
-    }
-
-    __host__ __device__
-    inline void set_stream(const cudaStream_t &s)
-    {
-      m_stream = s;
-    }
-
-    cudaStream_t m_stream;
-};
-
-
-// execution policy which submits kernel launches on a given stream
-class execute_on_stream
-  : public execute_on_stream_base<execute_on_stream>
-{
-  typedef execute_on_stream_base<execute_on_stream> super_t;
-
-  public:
-    __host__ __device__
-    inline execute_on_stream(cudaStream_t stream = default_stream())
-      : super_t(stream)
-    {}
-};
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/execution_policy.h b/thrust/system/cuda/detail/execution_policy.h
index e0ce1b62c..4202424c5 100644
--- a/thrust/system/cuda/detail/execution_policy.h
+++ b/thrust/system/cuda/detail/execution_policy.h
@@ -1,131 +1,100 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
 
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#include <thrust/version.h>
 #include <thrust/detail/execution_policy.h>
-#include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/iterator/detail/any_system_tag.h>
+#include <thrust/system/cuda/config.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-// put the canonical tag in the same ns as the backend's entry points
-namespace detail
-{
+#include <thrust/detail/allocator_aware_execution_policy.h>
 
-// this awkward sequence of definitions arise
-// from the desire both for tag to derive
-// from execution_policy and for execution_policy
-// to convert to tag (when execution_policy is not
-// an ancestor of tag)
+#if THRUST_CPP_DIALECT >= 2011
+  #include <thrust/detail/dependencies_aware_execution_policy.h>
+#endif
 
-// forward declaration of tag
-struct tag;
+THRUST_NAMESPACE_BEGIN
 
-// forward declaration of execution_policy
-template<typename> struct execution_policy;
+namespace cuda_cub
+{
 
-// specialize execution_policy for tag
-template<>
-  struct execution_policy<tag>
-    : thrust::execution_policy<tag>
-{};
+struct tag;
 
-// tag's definition comes before the
-// generic definition of execution_policy
-struct tag : execution_policy<tag> {};
+template <class>
+struct execution_policy;
 
-// allow conversion to tag when it is not a successor
-template<typename Derived>
-  struct execution_policy
-    : thrust::execution_policy<Derived>
+template <>
+struct execution_policy<tag> : thrust::execution_policy<tag>
 {
-  // allow conversion to tag
-  inline operator tag () const
-  {
-    return tag();
-  }
+  typedef tag tag_type;
 };
 
+struct tag : execution_policy<tag>
+, thrust::detail::allocator_aware_execution_policy<cuda_cub::execution_policy>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<cuda_cub::execution_policy>
+#endif
+{};
 
-template<typename System1, typename System2>
-  struct cross_system
-    : thrust::execution_policy<cross_system<System1,System2> >
+template <class Derived>
+struct execution_policy : thrust::execution_policy<Derived>
 {
-  inline __host__ __device__
-  cross_system(thrust::execution_policy<System1> &system1,
-               thrust::execution_policy<System2> &system2)
-    : system1(system1), system2(system2)
-  {}
-
-  thrust::execution_policy<System1> &system1;
-  thrust::execution_policy<System2> &system2;
-
-  inline __host__ __device__
-  cross_system<System2,System1> rotate() const
-  {
-    return cross_system<System2,System1>(system2,system1);
-  }
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
 };
 
+} // namespace cuda_cub
 
-// overloads of select_system
-
-// cpp interop
-template<typename System1, typename System2>
-inline __host__ __device__
-cross_system<System1,System2> select_system(const execution_policy<System1> &system1, const thrust::cpp::execution_policy<System2> &system2)
+namespace system { namespace cuda { namespace detail
 {
-  thrust::execution_policy<System1> &non_const_system1 = const_cast<execution_policy<System1>&>(system1);
-  thrust::cpp::execution_policy<System2> &non_const_system2 = const_cast<thrust::cpp::execution_policy<System2>&>(system2);
-  return cross_system<System1,System2>(non_const_system1,non_const_system2);
-}
 
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
 
-template<typename System1, typename System2>
-inline __host__ __device__
-cross_system<System1,System2> select_system(const thrust::cpp::execution_policy<System1> &system1, execution_policy<System2> &system2)
-{
-  thrust::cpp::execution_policy<System1> &non_const_system1 = const_cast<thrust::cpp::execution_policy<System1>&>(system1);
-  thrust::execution_policy<System2> &non_const_system2 = const_cast<execution_policy<System2>&>(system2);
-  return cross_system<System1,System2>(non_const_system1,non_const_system2);
-}
-
+}}} // namespace system::cuda::detail
 
-} // end detail
+namespace system { namespace cuda
+{
 
-// alias execution_policy and tag here
-using thrust::system::cuda::detail::execution_policy;
-using thrust::system::cuda::detail::tag;
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
 
-} // end cuda
-} // end system
+}} // namespace system::cuda
 
-// alias items at top-level
 namespace cuda
 {
 
-using thrust::system::cuda::execution_policy;
-using thrust::system::cuda::tag;
+using thrust::cuda_cub::tag;
+using thrust::cuda_cub::execution_policy;
+
+} // namespace cuda
 
-} // end cuda
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/extern_shared_ptr.h b/thrust/system/cuda/detail/extern_shared_ptr.h
deleted file mode 100644
index 1ec3486b9..000000000
--- a/thrust/system/cuda/detail/extern_shared_ptr.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-template<typename T>
-  class extern_shared_ptr
-{
-// don't attempt to compile with any compiler other than nvcc
-// due to use of __shared__ below
-#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
-  public:
-    __device__
-    inline operator T * (void)
-    {
-      extern __shared__ int4 smem[];
-      return reinterpret_cast<T*>(smem);
-    }
-
-    __device__
-    inline operator const T * (void) const
-    {
-      extern __shared__ int4 smem[];
-      return reinterpret_cast<const T*>(smem);
-    }
-#endif // THRUST_DEVICE_COMPILER_NVCC
-}; // end extern_shared_ptr
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/extrema.h b/thrust/system/cuda/detail/extrema.h
index c6ae90664..4fe7ec86b 100644
--- a/thrust/system/cuda/detail/extrema.h
+++ b/thrust/system/cuda/detail/extrema.h
@@ -1,22 +1,548 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/*******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/reduce.h>
+
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __extrema {
+
+  template <class InputType, class IndexType, class Predicate>
+  struct arg_min_f
+  {
+    Predicate predicate;
+    typedef tuple<InputType, IndexType> pair_type;
+
+    __host__ __device__
+    arg_min_f(Predicate p) : predicate(p) {}
+
+    pair_type __device__
+    operator()(pair_type const &lhs, pair_type const &rhs)
+    {
+      InputType const &rhs_value = get<0>(rhs);
+      InputType const &lhs_value = get<0>(lhs);
+      IndexType const &rhs_key   = get<1>(rhs);
+      IndexType const &lhs_key   = get<1>(lhs);
+
+      // check values first
+      if (predicate(lhs_value, rhs_value))
+        return lhs;
+      else if (predicate(rhs_value, lhs_value))
+        return rhs;
+
+      // values are equivalent, prefer smaller index
+      if (lhs_key < rhs_key)
+        return lhs;
+      else
+        return rhs;
+    }
+  };    // struct arg_min_f
+
+  template <class InputType, class IndexType, class Predicate>
+  struct arg_max_f
+  {
+    Predicate predicate;
+    typedef tuple<InputType, IndexType> pair_type;
+
+    __host__ __device__
+    arg_max_f(Predicate p) : predicate(p) {}
+
+    pair_type __device__
+    operator()(pair_type const &lhs, pair_type const &rhs)
+    {
+      InputType const &rhs_value = get<0>(rhs);
+      InputType const &lhs_value = get<0>(lhs);
+      IndexType const &rhs_key   = get<1>(rhs);
+      IndexType const &lhs_key   = get<1>(lhs);
+
+      // check values first
+      if (predicate(lhs_value, rhs_value))
+        return rhs;
+      else if (predicate(rhs_value, lhs_value))
+        return lhs;
+
+      // values are equivalent, prefer smaller index
+      if (lhs_key < rhs_key)
+        return lhs;
+      else
+        return rhs;
+    }
+  };    // struct arg_max_f
+
+  template<class InputType, class IndexType, class Predicate>
+  struct arg_minmax_f
+  {
+    Predicate predicate;
+
+    typedef tuple<InputType, IndexType> pair_type;
+    typedef tuple<pair_type, pair_type> two_pairs_type;
+
+    typedef arg_min_f<InputType, IndexType, Predicate> arg_min_t;
+    typedef arg_max_f<InputType, IndexType, Predicate> arg_max_t;
+
+    __host__ __device__
+    arg_minmax_f(Predicate p) : predicate(p)
+    {
+    }
+
+    two_pairs_type __device__
+    operator()(two_pairs_type const &lhs, two_pairs_type const &rhs)
+    {
+      pair_type const &rhs_min = get<0>(rhs);
+      pair_type const &lhs_min = get<0>(lhs);
+      pair_type const &rhs_max = get<1>(rhs);
+      pair_type const &lhs_max = get<1>(lhs);
+
+      auto result = thrust::make_tuple(arg_min_t(predicate)(lhs_min, rhs_min),
+                                       arg_max_t(predicate)(lhs_max, rhs_max));
+
+      return result;
+    }
+
+    struct duplicate_tuple
+    {
+      __device__ two_pairs_type
+      operator()(pair_type const &t)
+      {
+        return thrust::make_tuple(t, t);
+      }
+    };
+  }; // struct arg_minmax_f
+
+  template <class T,
+            class InputIt,
+            class OutputIt,
+            class Size,
+            class ReductionOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *       d_temp_storage,
+            size_t &     temp_storage_bytes,
+            InputIt      input_it,
+            Size         num_items,
+            ReductionOp  reduction_op,
+            OutputIt     output_it,
+            cudaStream_t stream)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+    using core::get_agent_plan;
+    using core::cuda_optional;
+
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        __reduce::ReduceAgent<InputIt, OutputIt, T, Size, ReductionOp> >
+        reduce_agent;
+
+    typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream);
+
+    cudaError_t status = cudaSuccess;
+
+
+    if (num_items <= reduce_plan.items_per_tile)
+    {
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+
+      // small, single tile size
+      if (d_temp_storage == NULL)
+      {
+        temp_storage_bytes = max<size_t>(1, vshmem_size);
+        return status;
+      }
+      char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
+
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only");
+      ra.launch(input_it, output_it, num_items, reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+    else
+    {
+      // regular size
+      cuda_optional<int> sm_count = core::get_sm_count();
+      CUDA_CUB_RET_IF_FAIL(sm_count.status());
+
+      // reduction will not use more cta counts than requested
+      cuda_optional<int> max_blocks_per_sm =
+          reduce_agent::
+              template get_max_blocks_per_sm<InputIt,
+                                             OutputIt,
+                                             Size,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
+                                             ReductionOp>(reduce_plan);
+      CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
+
+
+
+      int reduce_device_occupancy = (int)max_blocks_per_sm * sm_count;
+
+      int sm_oversubscription = 5;
+      int max_blocks          = reduce_device_occupancy * sm_oversubscription;
+
+      cub::GridEvenShare<Size> even_share;
+      even_share.DispatchInit(num_items, max_blocks,
+                              reduce_plan.items_per_tile);
+
+      // we will launch at most "max_blocks" blocks in a grid
+      // so preallocate virtual shared memory storage for this if required
+      //
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size,
+                                             max_blocks);
+
+      // Temporary storage allocation requirements
+      void * allocations[3] = {NULL, NULL, NULL};
+      size_t allocation_sizes[3] =
+          {
+              max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              vshmem_size                                        // size of virtualized shared memory storage
+          };
+      status = cub::AliasTemporaries(d_temp_storage,
+                                     temp_storage_bytes,
+                                     allocations,
+                                     allocation_sizes);
+      CUDA_CUB_RET_IF_FAIL(status);
+      if (d_temp_storage == NULL)
+      {
+        return status;
+      }
+
+      T *d_block_reductions = (T*) allocations[0];
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
+      char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
+
+
+      // Get grid size for device_reduce_sweep_kernel
+      int reduce_grid_size = 0;
+      if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE)
+      {
+        // Work is distributed evenly
+        reduce_grid_size = even_share.grid_size;
+      }
+      else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
+      {
+        // Work is distributed dynamically
+        size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile);
+
+        // if not enough to fill the device with threadblocks
+        // then fill the device with threadblocks
+        reduce_grid_size = static_cast<int>((min)(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+
+        typedef AgentLauncher<__reduce::DrainAgent<Size> > drain_agent;
+        AgentPlan drain_plan = drain_agent::get_plan();
+        drain_plan.grid_size = 1;
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent");
+        da.launch(queue, num_items);
+        CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+      }
+      else
+      {
+        CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported);
+      }
+
+      reduce_plan.grid_size = reduce_grid_size;
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce");
+      ra.launch(input_it,
+                d_block_reductions,
+                num_items,
+                even_share,
+                queue,
+                reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+
+      typedef AgentLauncher<
+        __reduce::ReduceAgent<T*, OutputIt, T, Size, ReductionOp> >
+        reduce_agent_single;
+
+      reduce_plan.grid_size = 1;
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce");
+
+      ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    return status;
+  }    // func doit_step
+
+  // this is an init-less reduce, needed for min/max-element functionality
+  // this will avoid copying the first value from device->host
+  template <typename Derived,
+            typename InputIt,
+            typename Size,
+            typename BinaryOp,
+            typename T>
+  THRUST_RUNTIME_FUNCTION
+  T extrema(execution_policy<Derived>& policy,
+            InputIt                    first,
+            Size                       num_items,
+            BinaryOp                   binary_op,
+            T*)
+  {
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (NULL, temp_storage_bytes, first, num_items_fixed,
+            binary_op, reinterpret_cast<T*>(NULL), stream));
+    cuda_cub::throw_on_error(status, "extrema failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "extrema failed on 1st alias storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "extrema failed on 2nd alias storage");
+
+    T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
+
+    THRUST_INDEX_TYPE_DISPATCH(status, doit_step<T>, num_items,
+        (allocations[1], temp_storage_bytes, first,
+            num_items_fixed, binary_op, d_result, stream));
+    cuda_cub::throw_on_error(status, "extrema failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "extrema failed to synchronize");
+
+    T result = cuda_cub::get_value(policy, d_result);
+
+    return result;
+  }
+
+  template <template <class, class, class> class ArgFunctor,
+            class Derived,
+            class ItemsIt,
+            class BinaryPred>
+  ItemsIt THRUST_RUNTIME_FUNCTION
+  element(execution_policy<Derived> &policy,
+          ItemsIt                    first,
+          ItemsIt                    last,
+          BinaryPred                 binary_pred)
+  {
+    if (first == last)
+      return last;
+
+    typedef typename iterator_traits<ItemsIt>::value_type      InputType;
+    typedef typename iterator_traits<ItemsIt>::difference_type IndexType;
+
+    IndexType num_items = static_cast<IndexType>(thrust::distance(first, last));
+
+    typedef tuple<ItemsIt, counting_iterator_t<IndexType> > iterator_tuple;
+    typedef zip_iterator<iterator_tuple> zip_iterator;
+
+    iterator_tuple iter_tuple = thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
+
+
+    typedef ArgFunctor<InputType, IndexType, BinaryPred> arg_min_t;
+    typedef tuple<InputType, IndexType> T;
+
+    zip_iterator begin = make_zip_iterator(iter_tuple);
+
+    T result = extrema(policy,
+                       begin,
+                       num_items,
+                       arg_min_t(binary_pred),
+                       (T *)(NULL));
+    return first + thrust::get<1>(result);
+  }
+
+
+}    // namespace __extrema
+
+/// min element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+ItemsIt __host__ __device__
+min_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            BinaryPred                 binary_pred)
+{
+  THRUST_CDP_DISPATCH(
+    (last = __extrema::element<__extrema::arg_min_f>(policy,
+                                                     first,
+                                                     last,
+                                                     binary_pred);),
+    (last = thrust::min_element(cvt_to_seq(derived_cast(policy)),
+                                first,
+                                last,
+                                binary_pred);));
+  return last;
+}
+
+template <class Derived,
+          class ItemsIt>
+ItemsIt __host__ __device__
+min_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::min_element(policy, first, last, less<value_type>());
+}
+
+/// max element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+ItemsIt __host__ __device__
+max_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            BinaryPred                 binary_pred)
+{
+  THRUST_CDP_DISPATCH(
+    (last = __extrema::element<__extrema::arg_max_f>(policy,
+                                                     first,
+                                                     last,
+                                                     binary_pred);),
+    (last = thrust::max_element(cvt_to_seq(derived_cast(policy)),
+                                first,
+                                last,
+                                binary_pred);));
+  return last;
+}
+
+template <class Derived,
+          class ItemsIt>
+ItemsIt __host__ __device__
+max_element(execution_policy<Derived> &policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::max_element(policy, first, last, less<value_type>());
+}
+
+/// minmax element
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt,
+          class BinaryPred>
+pair<ItemsIt, ItemsIt> __host__ __device__
+minmax_element(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last,
+               BinaryPred                 binary_pred)
+{
+  auto ret = thrust::make_pair(last, last);
+  if (first == last)
+  {
+    return ret;
+  }
+
+  THRUST_CDP_DISPATCH(
+    (using InputType = typename iterator_traits<ItemsIt>::value_type;
+     using IndexType = typename iterator_traits<ItemsIt>::difference_type;
+
+     const auto num_items =
+       static_cast<IndexType>(thrust::distance(first, last));
+
+     using iterator_tuple = tuple<ItemsIt, counting_iterator_t<IndexType>>;
+     using zip_iterator   = zip_iterator<iterator_tuple>;
+
+     iterator_tuple iter_tuple =
+       thrust::make_tuple(first, counting_iterator_t<IndexType>(0));
+
+     using arg_minmax_t =
+       __extrema::arg_minmax_f<InputType, IndexType, BinaryPred>;
+     using two_pairs_type = typename arg_minmax_t::two_pairs_type;
+     using duplicate_t    = typename arg_minmax_t::duplicate_tuple;
+     using transform_t =
+       transform_input_iterator_t<two_pairs_type, zip_iterator, duplicate_t>;
+
+     zip_iterator   begin = make_zip_iterator(iter_tuple);
+     two_pairs_type result =
+       __extrema::extrema(policy,
+                          transform_t(begin, duplicate_t()),
+                          num_items,
+                          arg_minmax_t(binary_pred),
+                          (two_pairs_type *)(NULL));
+     ret = thrust::make_pair(first + get<1>(get<0>(result)),
+                             first + get<1>(get<1>(result)));),
+    // CDP Sequential impl:
+    (ret = thrust::minmax_element(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  binary_pred);));
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt>
+pair<ItemsIt, ItemsIt> __host__ __device__
+minmax_element(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last)
+{
+  typedef typename iterator_value<ItemsIt>::type value_type;
+  return cuda_cub::minmax_element(policy, first, last, less<value_type>());
+}
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/fill.h b/thrust/system/cuda/detail/fill.h
index 0bcda4a0e..80ea68592 100644
--- a/thrust/system/cuda/detail/fill.h
+++ b/thrust/system/cuda/detail/fill.h
@@ -1,22 +1,90 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __fill {
+
+  // fill functor
+  template<class Iterator, class T>
+  struct functor
+  {
+    Iterator it;
+    T value;
+
+    THRUST_FUNCTION
+    functor(Iterator it, T value)
+        : it(it), value(value) {}
+
+    template<class Size>
+    THRUST_DEVICE_FUNCTION void operator()(Size idx)
+    {
+      it[idx] = value;
+    }
+  }; // struct functor
+
+}    // namespace __fill
+
+template <class Derived, class OutputIterator, class Size, class T>
+OutputIterator __host__ __device__
+fill_n(execution_policy<Derived>& policy,
+       OutputIterator             first,
+       Size                       count,
+       const T&                   value)
+{
+  cuda_cub::parallel_for(policy,
+                         __fill::functor<OutputIterator, T>(
+                         first,
+                         value),
+                         count);
+
+  return first + count;
+}    // func fill_n
+
+template <class Derived, class ForwardIterator, class T>
+void __host__ __device__
+fill(execution_policy<Derived>& policy,
+     ForwardIterator            first,
+     ForwardIterator            last,
+     const T&                   value)
+{
+  cuda_cub::fill_n(policy, first, thrust::distance(first,last), value);
+} // func filll
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/find.h b/thrust/system/cuda/detail/find.h
index c6ae90664..b7d2b748f 100644
--- a/thrust/system/cuda/detail/find.h
+++ b/thrust/system/cuda/detail/find.h
@@ -1,22 +1,218 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+// XXX forward declare to circumvent circular depedency
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if(execution_policy<Derived>& policy,
+        InputIt                    first,
+        InputIt                    last,
+        Predicate                  predicate);
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if_not(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            Predicate                  predicate);
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+find(execution_policy<Derived> &policy,
+     InputIt                    first,
+     InputIt                    last,
+     T const& value);
+
+}; // namespace cuda_cub
+THRUST_NAMESPACE_END
+
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/iterator/zip_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __find_if {
+
+  template <typename TupleType>
+  struct functor
+  {
+    THRUST_DEVICE_FUNCTION TupleType
+    operator()(const TupleType& lhs, const TupleType& rhs) const
+    {
+      // select the smallest index among true results
+      if (thrust::get<0>(lhs) && thrust::get<0>(rhs))
+      {
+        return TupleType(true, (thrust::min)(thrust::get<1>(lhs), thrust::get<1>(rhs)));
+      }
+      else if (thrust::get<0>(lhs))
+      {
+        return lhs;
+      }
+      else
+      {
+        return rhs;
+      }
+    }
+  };
+}    // namespace __find_if
+
+template <class Derived,
+          class InputIt,
+          class Size,
+          class Predicate>
+InputIt __host__ __device__
+find_if_n(execution_policy<Derived>& policy,
+          InputIt                    first,
+          Size                       num_items,
+          Predicate                  predicate)
+{
+  typedef typename thrust::tuple<bool,Size> result_type;
+  
+  // empty sequence
+  if(num_items == 0) return first;
+  
+  // this implementation breaks up the sequence into separate intervals
+  // in an attempt to early-out as soon as a value is found
+  //
+  // XXX compose find_if from a look-back prefix scan algorithm
+  //     and abort kernel when the first element is found
+
+
+  // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32
+  const Size interval_threshold = 1 << 20;
+  const Size interval_size = (thrust::min)(interval_threshold, num_items);
+  
+  // force transform_iterator output to bool
+  typedef transform_input_iterator_t<bool,
+                                     InputIt,
+                                     Predicate>
+      XfrmIterator;
+  typedef thrust::tuple<XfrmIterator,
+                        counting_iterator_t<Size> >
+      IteratorTuple;
+  typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
+
+  IteratorTuple iter_tuple =
+      thrust::make_tuple(XfrmIterator(first, predicate),
+                         counting_iterator_t<Size>(0));
+
+  ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
+  ZipIterator end   = begin + num_items;
+
+  for (ZipIterator interval_begin = begin;
+       interval_begin < end;
+       interval_begin += interval_size)
+  {
+    ZipIterator interval_end = interval_begin + interval_size;
+    if(end < interval_end)
+    {
+      interval_end = end;
+    } // end if
+
+    result_type result = reduce(policy,
+                                interval_begin,
+                                interval_end,
+                                result_type(false, interval_end - begin),
+                                __find_if::functor<result_type>());
+
+    // see if we found something
+    if(thrust::get<0>(result))
+    {
+      return first + thrust::get<1>(result);
+    }
+  }
+  
+  //nothing was found if we reach here...
+  return first + num_items;
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if(execution_policy<Derived>& policy,
+        InputIt                    first,
+        InputIt                    last,
+        Predicate                  predicate)
+{
+  return cuda_cub::find_if_n(policy, first, thrust::distance(first,last), predicate);
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+find_if_not(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            Predicate                  predicate)
+{
+  return cuda_cub::find_if(policy, first, last, thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+find(execution_policy<Derived> &policy,
+     InputIt                    first,
+     InputIt                    last,
+     T const& value)
+{
+  using thrust::placeholders::_1;
+
+  return cuda_cub::find_if(policy,
+                        first,
+                        last,
+                        _1 == value);
+}
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/for_each.h b/thrust/system/cuda/detail/for_each.h
index 52af8af65..518538ff3 100644
--- a/thrust/system/cuda/detail/for_each.h
+++ b/thrust/system/cuda/detail/for_each.h
@@ -1,65 +1,104 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
+#pragma once
 
+#include <thrust/detail/config.h>
 
-/*! \file for_each.h
- *  \brief Defines the interface for a function that executes a 
- *  function or functional for each value in a given range.
- */
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/config.h>
 
-#pragma once
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/detail/function.h>
+#include <thrust/distance.h>
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+  // for_each functor
+  template <class Input, class UnaryOp>
+  struct for_each_f
+  {
+    Input input;
+    UnaryOp op;
 
+    THRUST_FUNCTION
+    for_each_f(Input input, UnaryOp op)
+        : input(input), op(op) {}
 
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename UnaryFunction>
-__host__ __device__
-RandomAccessIterator for_each(execution_policy<DerivedPolicy> &s,
-                              RandomAccessIterator first,
-                              RandomAccessIterator last,
-                              UnaryFunction f);
+    template <class Size>
+    THRUST_DEVICE_FUNCTION void operator()(Size idx)
+    {
+      op(raw_reference_cast(*(input + idx)));
+    }
+  };
 
+  //-------------------------
+  // Thrust API entry points
+  //-------------------------
 
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename Size,
-         typename UnaryFunction>
-__host__ __device__
-RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &s,
-                                RandomAccessIterator first,
-                                Size n,
-                                UnaryFunction f);
+  // for_each_n
+  template <class Derived,
+            class Input,
+            class Size,
+            class UnaryOp>
+  Input THRUST_FUNCTION
+  for_each_n(execution_policy<Derived> &policy,
+             Input                      first,
+             Size                       count,
+             UnaryOp                    op)
+  {
+    typedef thrust::detail::wrapped_function<UnaryOp, void> wrapped_t;
+    wrapped_t wrapped_op(op);
 
+    cuda_cub::parallel_for(policy,
+                           for_each_f<Input, wrapped_t>(first, wrapped_op),
+                           count);
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+    return first + count;
+  }
 
-#include <thrust/system/cuda/detail/for_each.inl>
+  // for_each
+  template <class Derived,
+            class Input,
+            class UnaryOp>
+  Input THRUST_FUNCTION
+  for_each(execution_policy<Derived> &policy,
+           Input                      first,
+           Input                      last,
+           UnaryOp                    op)
+  {
+    typedef typename iterator_traits<Input>::difference_type size_type;
+    size_type count = static_cast<size_type>(thrust::distance(first,last));
+    return cuda_cub::for_each_n(policy, first,  count, op);
+  }
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/for_each.inl b/thrust/system/cuda/detail/for_each.inl
deleted file mode 100644
index 1536994f5..000000000
--- a/thrust/system/cuda/detail/for_each.inl
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file for_each.inl
- *  \brief Inline file for for_each.h.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/for_each.h>
-#include <thrust/distance.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/function.h>
-#include <thrust/detail/seq.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace for_each_n_detail
-{
-
-
-struct for_each_kernel
-{
-  template<typename Iterator, typename Function, typename Size>
-  __host__ __device__
-  void operator()(bulk_::parallel_group<bulk_::concurrent_group<> > &grid, Iterator first, Function f, Size n)
-  {
-    Size grid_size = grid.size() * grid.this_exec.size();
-
-    Size i = grid.this_exec.index() * grid.this_exec.size() + grid.this_exec.this_exec.index();
-
-    first += i;
-
-    while(i < n)
-    {
-      f(*first);
-      i += grid_size;
-      first += grid_size;
-    }
-  }
-};
-
-
-template<typename Size>
-__host__ __device__
-bool use_wide_counter(Size n, unsigned int narrow_grid_size)
-{
-  // use the wide counter when n will not fit within an unsigned int
-  // or if incrementing an unsigned int by narrow_grid_size would overflow
-  // the counter
-  Size threshold = static_cast<Size>(UINT_MAX);
-
-  bool result = (sizeof(Size) > sizeof(unsigned int)) && (n > threshold);
-
-  if(!result)
-  {
-    // check if we'd overflow the little closure's counter
-    unsigned int narrow_n = static_cast<unsigned int>(n);
-
-    if((narrow_n - 1u) + narrow_grid_size < narrow_n)
-    {
-      result = true;
-    }
-  }
-
-  return result;
-}
-
-
-} // end for_each_n_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename Size,
-         typename UnaryFunction>
-__host__ __device__
-RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &exec,
-                                RandomAccessIterator first,
-                                Size n,
-                                UnaryFunction f)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator parallel_path(execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, Size n, UnaryFunction f)
-    {
-      thrust::detail::wrapped_function<UnaryFunction,void> wrapped_f(f);
-
-      // opportunistically narrow the type of n
-
-      unsigned int narrow_n = static_cast<unsigned int>(n);
-      unsigned int narrow_num_groups = 0;
-      unsigned int narrow_group_size = 0;
-
-      // automatically choose a number of groups and a group size
-      thrust::tie(narrow_num_groups, narrow_group_size) = bulk_::choose_sizes(bulk_::grid(), for_each_n_detail::for_each_kernel(), bulk_::root, first, wrapped_f, narrow_n);
-
-      // do we need to use the wider type?
-      if(for_each_n_detail::use_wide_counter(n, narrow_num_groups * narrow_group_size))
-      {
-        Size num_groups = 0;
-        Size group_size = 0;
-        thrust::tie(num_groups, group_size) = bulk_::choose_sizes(bulk_::grid(), for_each_n_detail::for_each_kernel(), bulk_::root, first, wrapped_f, n);
-
-        num_groups = thrust::min<Size>(num_groups, thrust::detail::util::divide_ri(n, group_size));
-
-        bulk_::async(bulk_::grid(num_groups,group_size,0,stream(thrust::detail::derived_cast(exec))), for_each_n_detail::for_each_kernel(), bulk_::root, first, wrapped_f, n);
-      }
-      else
-      {
-        // we can use the narrower type for n
-        narrow_num_groups = thrust::min<unsigned int>(narrow_num_groups, thrust::detail::util::divide_ri(narrow_n, narrow_group_size));
-
-        bulk_::async(bulk_::grid(narrow_num_groups,narrow_group_size,0,stream(thrust::detail::derived_cast(exec))), for_each_n_detail::for_each_kernel(), bulk_::root, first, wrapped_f, narrow_n);
-      }
-
-      return first + n;
-    }
-
-    __host__ __device__
-    static RandomAccessIterator sequential_path(execution_policy<DerivedPolicy> &, RandomAccessIterator first, Size n, UnaryFunction f)
-    {
-      return thrust::for_each_n(thrust::seq, first, n, f);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, n, f);
-#else
-  return workaround::sequential_path(exec, first, n, f);
-#endif
-} 
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename UnaryFunction>
-__host__ __device__
-InputIterator for_each(execution_policy<DerivedPolicy> &exec,
-                       InputIterator first,
-                       InputIterator last,
-                       UnaryFunction f)
-{
-  return cuda::detail::for_each_n(exec, first, thrust::distance(first,last), f);
-} // end for_each()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/future.inl b/thrust/system/cuda/detail/future.inl
new file mode 100644
index 000000000..f23184aae
--- /dev/null
+++ b/thrust/system/cuda/detail/future.inl
@@ -0,0 +1,1372 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+// TODO: Split into more granular headers (move unique_stream/unique_marker to
+// another header, etc).
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/optional.h>
+#include <thrust/detail/type_deduction.h>
+#include <thrust/type_traits/integer_sequence.h>
+#include <thrust/type_traits/remove_cvref.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/detail/tuple_algorithms.h>
+#include <thrust/allocate_unique.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/execute_with_dependencies.h>
+#include <thrust/detail/event_error.h>
+#include <thrust/system/cuda/memory.h>
+#include <thrust/system/cuda/future.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/get_value.h>
+
+#include <type_traits>
+#include <thrust/detail/memory_wrapper.h>
+
+THRUST_NAMESPACE_BEGIN
+
+// Forward declaration.
+struct new_stream_t;
+
+namespace system { namespace cuda { namespace detail
+{
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct nonowning_t final {};
+
+THRUST_INLINE_CONSTANT nonowning_t nonowning{};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct marker_deleter final
+{
+  __host__
+  void operator()(CUevent_st* e) const
+  {
+    if (nullptr != e)
+      thrust::cuda_cub::throw_on_error(cudaEventDestroy(e));
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct unique_marker final
+{
+  using native_handle_type = CUevent_st*;
+
+private:
+  std::unique_ptr<CUevent_st, marker_deleter> handle_;
+
+public:
+  /// \brief Create a new stream and construct a handle to it. When the handle
+  ///        is destroyed, the stream is destroyed.
+  __host__
+  unique_marker()
+    : handle_(nullptr, marker_deleter())
+  {
+    native_handle_type e;
+    thrust::cuda_cub::throw_on_error(
+      cudaEventCreateWithFlags(&e, cudaEventDisableTiming)
+    );
+    handle_.reset(e);
+  }
+
+  __thrust_exec_check_disable__
+  unique_marker(unique_marker const&) = delete;
+  __thrust_exec_check_disable__
+  unique_marker(unique_marker&&) = default;
+  __thrust_exec_check_disable__
+  unique_marker& operator=(unique_marker const&) = delete;
+  __thrust_exec_check_disable__
+  unique_marker& operator=(unique_marker&&) = default;
+
+  __thrust_exec_check_disable__
+  ~unique_marker() = default;
+
+  __host__
+  auto get() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+  __host__
+  auto native_handle() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+
+  __host__
+  bool valid() const noexcept { return bool(handle_); }
+
+  __host__
+  bool ready() const
+  {
+    cudaError_t const err = cudaEventQuery(handle_.get());
+
+    if (cudaErrorNotReady == err)
+      return false;
+
+    // Throw on any other error.
+    thrust::cuda_cub::throw_on_error(err);
+
+    return true;
+  }
+
+  __host__
+  void wait() const
+  {
+    thrust::cuda_cub::throw_on_error(cudaEventSynchronize(handle_.get()));
+  }
+
+  __host__
+  bool operator==(unique_marker const& other) const
+  {
+    return other.handle_ == handle_;
+  }
+
+  __host__
+  bool operator!=(unique_marker const& other) const
+  {
+    return !(other == *this);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct stream_deleter final
+{
+  __host__
+  void operator()(CUstream_st* s) const
+  {
+    if (nullptr != s)
+      thrust::cuda_cub::throw_on_error(cudaStreamDestroy(s));
+  }
+};
+
+struct stream_conditional_deleter final
+{
+private:
+  bool cond_;
+
+public:
+  __host__
+  constexpr stream_conditional_deleter() noexcept
+    : cond_(true) {}
+
+  __host__
+  explicit constexpr stream_conditional_deleter(nonowning_t) noexcept
+    : cond_(false) {}
+
+  __host__
+  void operator()(CUstream_st* s) const
+  {
+    if (cond_ && nullptr != s)
+    {
+      thrust::cuda_cub::throw_on_error(cudaStreamDestroy(s));
+    }
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct unique_stream final
+{
+  using native_handle_type = CUstream_st*;
+
+private:
+  std::unique_ptr<CUstream_st, stream_conditional_deleter> handle_;
+
+public:
+  /// \brief Create a new stream and construct a handle to it. When the handle
+  ///        is destroyed, the stream is destroyed.
+  __host__
+  unique_stream()
+    : handle_(nullptr, stream_conditional_deleter())
+  {
+    native_handle_type s;
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking)
+    );
+    handle_.reset(s);
+  }
+
+  /// \brief Construct a non-owning handle to an existing stream. When the
+  ///        handle is destroyed, the stream is not destroyed.
+  __host__
+  explicit unique_stream(nonowning_t, native_handle_type handle)
+    : handle_(handle, stream_conditional_deleter(nonowning))
+  {}
+
+  __thrust_exec_check_disable__
+  unique_stream(unique_stream const&) = delete;
+
+  // GCC 10 complains if this is defaulted. See NVIDIA/thrust#1269.
+  __thrust_exec_check_disable__
+  __host__ unique_stream(unique_stream &&o) noexcept
+    : handle_(std::move(o.handle_))
+  {}
+
+  __thrust_exec_check_disable__
+  unique_stream& operator=(unique_stream const&) = delete;
+  __thrust_exec_check_disable__
+  unique_stream& operator=(unique_stream&&) = default;
+
+  __thrust_exec_check_disable__
+  ~unique_stream() = default;
+
+  __host__
+  auto get() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+  __host__
+  auto native_handle() const
+  THRUST_DECLTYPE_RETURNS(native_handle_type(handle_.get()));
+
+  __host__
+  bool valid() const noexcept { return bool(handle_); }
+
+  __host__
+  bool ready() const
+  {
+    cudaError_t const err = cudaStreamQuery(handle_.get());
+
+    if (cudaErrorNotReady == err)
+      return false;
+
+    // Throw on any other error.
+    thrust::cuda_cub::throw_on_error(err);
+
+    return true;
+  }
+
+  __host__
+  void wait() const
+  {
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamSynchronize(handle_.get())
+    );
+  }
+
+  __host__
+  void depend_on(unique_marker& e)
+  {
+    thrust::cuda_cub::throw_on_error(
+      cudaStreamWaitEvent(handle_.get(), e.get(), 0)
+    );
+  }
+
+  __host__
+  void depend_on(unique_stream& s)
+  {
+    if (s != *this)
+    {
+      unique_marker e;
+      s.record(e);
+      depend_on(e);
+    }
+  }
+
+  __host__
+  void record(unique_marker& e)
+  {
+    thrust::cuda_cub::throw_on_error(cudaEventRecord(e.get(), handle_.get()));
+  }
+
+  __host__
+  bool operator==(unique_stream const& other) const
+  {
+    return other.handle_ == handle_;
+  }
+
+  __host__
+  bool operator!=(unique_stream const& other) const
+  {
+    return !(other == *this);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Inheritance hierarchy of future/event shared state types.
+
+struct async_signal;
+
+template <typename KeepAlives>
+struct async_keep_alives /* : virtual async_signal */;
+
+template <typename T>
+struct async_value /* : virtual async_signal */;
+
+template <typename T, typename Pointer, typename KeepAlives>
+struct async_addressable_value_with_keep_alives
+/* : async_value<T>, async_keep_alives<KeepAlives> */;
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Pointer>
+struct weak_promise;
+
+template <typename X, typename XPointer = pointer<X>>
+struct unique_eager_future_promise_pair final
+{
+  unique_eager_future<X>    future;
+  weak_promise<X, XPointer> promise;
+};
+
+struct acquired_stream final
+{
+  unique_stream stream;
+  optional<std::size_t> const acquired_from;
+  // `acquired_from` contains the index in the tuple of dependencies from which
+  // the stream was acquired. If `acquired_from` is empty, no stream could be
+  // acquired from a dependency, and then the stream was newly created.
+};
+
+// Precondition: `device` is the current CUDA device.
+template <typename X, typename Y, typename Deleter>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, std::unique_ptr<Y, Deleter>&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, unique_stream& stream) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, ready_event&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+template <typename X>
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, ready_future<X>&) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_event& parent) noexcept;
+
+// Precondition: `device` is the current CUDA device.
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device, unique_eager_future<X>& parent) noexcept;
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream(int device, Dependencies&... deps) noexcept;
+  
+template <typename... Dependencies>
+__host__
+unique_eager_event
+make_dependent_event(
+  std::tuple<Dependencies...>&& deps
+);
+
+template <
+  typename X, typename XPointer
+, typename ComputeContent, typename... Dependencies
+>
+__host__
+unique_eager_future_promise_pair<X, XPointer>
+make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps);
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct async_signal
+{
+protected:
+  unique_stream stream_;
+
+public:
+  // Constructs an `async_signal` which uses `stream`.
+  __host__
+  explicit async_signal(unique_stream&& stream)
+    : stream_(std::move(stream))
+  {}
+
+  __host__
+  virtual ~async_signal() {}
+
+  unique_stream&       stream()       noexcept { return stream_; }
+  unique_stream const& stream() const noexcept { return stream_; }
+};
+
+template <typename... KeepAlives>
+struct async_keep_alives<std::tuple<KeepAlives...>> : virtual async_signal
+{
+  using keep_alives_type = std::tuple<KeepAlives...>;
+
+protected:
+  keep_alives_type keep_alives_;
+
+public:
+  // Constructs an `async_keep_alives` which uses `stream`, and keeps the
+  // objects in the tuple `keep_alives` alive until the asynchronous signal is
+  // destroyed.
+  __host__
+  explicit async_keep_alives(
+    unique_stream&& stream, keep_alives_type&& keep_alives
+  )
+    : async_signal(std::move(stream))
+    , keep_alives_(std::move(keep_alives))
+  {}
+
+  __host__
+  virtual ~async_keep_alives() {}
+};
+
+template <typename T>
+struct async_value : virtual async_signal
+{
+  using value_type        = T;
+  using raw_const_pointer = value_type const*;
+
+  // Constructs an `async_value` which uses `stream` and has no content.
+  __host__
+  explicit async_value(unique_stream stream)
+    : async_signal(std::move(stream))
+  {}
+
+  __host__
+  virtual ~async_value() {}
+
+  __host__
+  virtual bool valid_content() const noexcept { return false; }
+
+  __host__
+  virtual value_type get()
+  {
+    throw thrust::event_error(event_errc::no_state);
+  }
+
+  __host__
+  virtual value_type extract()
+  {
+    throw thrust::event_error(event_errc::no_state);
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  __host__
+  virtual raw_const_pointer raw_data() const
+  {
+    return nullptr;
+  }
+  #endif
+};
+
+template <typename T, typename Pointer, typename... KeepAlives>
+struct async_addressable_value_with_keep_alives<
+  T, Pointer, std::tuple<KeepAlives...>
+> final
+  : async_value<T>, async_keep_alives<std::tuple<KeepAlives...>>
+{
+  using value_type        = typename async_value<T>::value_type;
+  using raw_const_pointer = typename async_value<T>::raw_const_pointer;
+
+  using keep_alives_type
+    = typename async_keep_alives<std::tuple<KeepAlives...>>::keep_alives_type;
+
+  using pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<value_type>::other;
+  using const_pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<value_type const>::other;
+
+private:
+  pointer content_;
+
+public:
+  // Constructs an `async_addressable_value_with_keep_alives` which uses
+  // `stream`, keeps the objects in the tuple `keep_alives` alive until the
+  // asynchronous value is destroyed, and determines the location of its
+  // content by evaluating `compute_content(content_keep_alive)`.
+  // NOTE: The use of a callback idiom is necessary if the content is stored in
+  // place in the content keep alive object, in which case we need to get its
+  // address after its been moved into the new signal we're constructing.
+  // NOTE: NVCC has a bug that causes it to reorder our base class initializers
+  // in generated host code, which leads to -Wreorder warnings.
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_BEGIN
+  template <typename ComputeContent>
+  __host__
+  explicit async_addressable_value_with_keep_alives(
+    unique_stream&&    stream
+  , keep_alives_type&& keep_alives
+  , ComputeContent&&   compute_content
+  )
+    : async_signal(std::move(stream))
+    , async_value<T>(std::move(stream))
+    , async_keep_alives<keep_alives_type>(
+        std::move(stream), std::move(keep_alives)
+      )
+  {
+    content_ = THRUST_FWD(compute_content)(std::get<0>(this->keep_alives_));
+  }
+  THRUST_DISABLE_CLANG_AND_GCC_INITIALIZER_REORDERING_WARNING_END
+
+  __host__
+  bool valid_content() const noexcept final override
+  {
+    return nullptr != content_;
+  }
+
+  // Precondition: `true == valid_content()`.
+  __host__
+  pointer data() 
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return content_;
+  }
+
+  // Precondition: `true == valid_content()`.
+  __host__
+  const_pointer data() const 
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return content_;
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type get() final override
+  {
+    this->stream().wait();
+    return *data();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type extract() final override
+  {
+    this->stream().wait();
+    return std::move(*data());
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  __host__
+  raw_const_pointer raw_data() const final override
+  {
+    return raw_pointer_cast(content_);
+  }
+  #endif
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, typename Pointer>
+struct weak_promise final
+{
+  using value_type = typename async_value<T>::value_type;
+
+  using pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<T>::other;
+  using const_pointer
+    = typename thrust::detail::pointer_traits<Pointer>::template
+      rebind<T const>::other;
+
+private:
+  int device_ = 0;
+  pointer content_;
+
+  explicit weak_promise(int device_id, pointer content)
+    : device_(device_id), content_(std::move(content))
+  {}
+
+public:
+  __host__ __device__
+  weak_promise() : device_(0), content_{} {}
+
+  __thrust_exec_check_disable__
+  weak_promise(weak_promise const&) = default;
+  __thrust_exec_check_disable__
+  weak_promise(weak_promise&&) = default;
+  __thrust_exec_check_disable__
+  weak_promise& operator=(weak_promise const&) = default;
+  __thrust_exec_check_disable__
+  weak_promise& operator=(weak_promise&&) = default;
+
+  template <typename U>
+  __host__ __device__
+  void set_value(U&& value) &&
+  {
+    *content_ = THRUST_FWD(value);
+  }
+
+  template <
+    typename X, typename XPointer
+  , typename ComputeContent, typename... Dependencies
+  >
+  friend __host__
+  unique_eager_future_promise_pair<X, XPointer>
+  thrust::system::cuda::detail::make_dependent_future(
+    ComputeContent&& cc, std::tuple<Dependencies...>&& deps
+  );
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+struct ready_event final
+{
+  ready_event() = default;
+
+  template <typename U>
+  __host__ __device__
+  explicit ready_event(ready_future<U>) {}
+
+  __host__ __device__
+  static constexpr bool valid_content() noexcept { return true; }
+
+  __host__ __device__
+  static constexpr bool ready() noexcept { return true; }
+};
+
+template <typename T>
+struct ready_future final
+{
+  using value_type        = T;
+  using raw_const_pointer = T const*;
+
+private:
+  value_type value_;
+
+public:
+  __host__ __device__
+  ready_future() : value_{} {}
+
+  ready_future(ready_future&&) = default;
+  ready_future(ready_future const&) = default;
+  ready_future& operator=(ready_future&&) = default;
+  ready_future& operator=(ready_future const&) = default;
+
+  template <typename U>
+  __host__ __device__
+  explicit ready_future(U&& u) : value_(THRUST_FWD(u)) {}
+
+  __host__ __device__
+  static constexpr bool valid_content() noexcept { return true; }
+
+  __host__ __device__
+  static constexpr bool ready() noexcept { return true; }
+
+  __host__ __device__
+  value_type get() const
+  {
+    return value_;
+  }
+
+  THRUST_NODISCARD __host__ __device__
+  value_type extract() 
+  {
+    return std::move(value_);
+  }
+
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  // For testing only.
+  __host__ __device__
+  raw_const_pointer data() const
+  {
+    return addressof(value_);
+  }
+  #endif
+};
+
+struct unique_eager_event final
+{
+protected:
+  int device_ = 0;
+  std::unique_ptr<detail::async_signal> async_signal_;
+
+  __host__
+  explicit unique_eager_event(
+    int device_id, std::unique_ptr<detail::async_signal> async_signal
+  )
+    : device_(device_id), async_signal_(std::move(async_signal))
+  {}
+
+public:
+  __host__
+  unique_eager_event()
+    : device_(0), async_signal_()
+  {}
+
+  unique_eager_event(unique_eager_event&&) = default;
+  unique_eager_event(unique_eager_event const&) = delete;
+  unique_eager_event& operator=(unique_eager_event&&) = default;
+  unique_eager_event& operator=(unique_eager_event const&) = delete;
+
+  // Any `unique_eager_future<T>` can be explicitly converted to a
+  // `unique_eager_event<void>`.
+  template <typename U>
+  __host__
+  explicit unique_eager_event(unique_eager_future<U>&& other)
+    // NOTE: We upcast to `unique_ptr<async_signal>` here.
+    : device_(other.where()), async_signal_(std::move(other.async_signal_))
+  {}
+
+  __host__
+  // NOTE: We take `new_stream_t` by `const&` because it is incomplete here.
+  explicit unique_eager_event(new_stream_t const&)
+    : device_(0)
+    , async_signal_(new detail::async_signal(detail::unique_stream{}))
+  {
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_));
+  }
+
+  __host__
+  virtual ~unique_eager_event()
+  {
+    // FIXME: If we could asynchronously handle destruction of keep alives, we
+    // could avoid doing this.
+    if (valid_stream()) wait();
+  }
+
+  __host__
+  bool valid_stream() const noexcept
+  {
+    return bool(async_signal_);
+  }
+
+  __host__
+  bool ready() const noexcept
+  {
+    if (valid_stream())
+      return stream().ready();
+    else
+      return false;
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  detail::unique_stream& stream()
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+  detail::unique_stream const& stream() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+
+  __host__
+  int where() const noexcept { return device_; }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  void wait()
+  {
+    stream().wait();
+  }
+
+  friend __host__
+  optional<detail::unique_stream>
+  thrust::system::cuda::detail::try_acquire_stream(
+    int device_id, unique_eager_event& parent
+    ) noexcept;
+
+  template <typename... Dependencies>
+  friend __host__
+  unique_eager_event
+  thrust::system::cuda::detail::make_dependent_event(
+    std::tuple<Dependencies...>&& deps
+  );
+};
+
+template <typename T>
+struct unique_eager_future final
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (!std::is_same<T, remove_cvref_t<void>>::value)
+  , "`thrust::event` should be used to express valueless futures"
+  );
+
+  using value_type        = typename detail::async_value<T>::value_type;
+  using raw_const_pointer = typename detail::async_value<T>::raw_const_pointer;
+
+private:
+  int device_ = 0;
+  std::unique_ptr<detail::async_value<value_type>> async_signal_;
+
+  __host__
+  explicit unique_eager_future(
+    int device_id, std::unique_ptr<detail::async_value<value_type>> async_signal
+  )
+    : device_(device_id), async_signal_(std::move(async_signal))
+  {}
+
+public:
+  __host__
+  unique_eager_future()
+    : device_(0), async_signal_()
+  {}
+
+  unique_eager_future(unique_eager_future&&) = default;
+  unique_eager_future(unique_eager_future const&) = delete;
+  unique_eager_future& operator=(unique_eager_future&&) = default;
+  unique_eager_future& operator=(unique_eager_future const&) = delete;
+
+  __host__
+  // NOTE: We take `new_stream_t` by `const&` because it is incomplete here.
+  explicit unique_eager_future(new_stream_t const&)
+    : device_(0)
+    , async_signal_(new detail::async_value<value_type>(detail::unique_stream{}))
+  {
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_));
+  }
+
+  __host__
+  ~unique_eager_future()
+  {
+    // FIXME: If we could asynchronously handle destruction of keep alives, we
+    // could avoid doing this.
+    if (valid_stream()) wait();
+  }
+
+  __host__
+  bool valid_stream() const noexcept
+  {
+    return bool(async_signal_);
+  }
+
+  __host__
+  bool valid_content() const noexcept
+  {
+    if (!valid_stream())
+      return false;
+
+    // We might have been constructed with `new_stream_t`, in which case we'd
+    // have an async_value, but it doesn't have content.
+    return async_signal_->valid_content();
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  bool ready() const noexcept
+  {
+    if (valid_stream())
+      return stream().ready();
+    else
+      return false;
+  }
+
+  // Precondition: `true == valid_stream()`.
+  __host__
+  detail::unique_stream& stream()
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+  __host__
+  detail::unique_stream const& stream() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->stream();
+  }
+
+  __host__
+  int where() const noexcept { return device_; }
+
+  // Blocks.
+  // Precondition: `true == valid_stream()`.
+  __host__
+  void wait()
+  {
+    stream().wait();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  __host__
+  value_type get()
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    return async_signal_->get();
+  }
+
+  // Blocks.
+  // Precondition: `true == valid_content()`.
+  THRUST_NODISCARD __host__
+  value_type extract()
+  {
+    if (!valid_content())
+      throw thrust::event_error(event_errc::no_content);
+
+    value_type tmp(async_signal_->extract());
+    async_signal_.reset();
+    return tmp;
+  }
+
+  // For testing only.
+  #if defined(THRUST_ENABLE_FUTURE_RAW_DATA_MEMBER)
+  // Precondition: `true == valid_stream()`.
+  __host__
+  raw_const_pointer raw_data() const
+  {
+    if (!valid_stream())
+      throw thrust::event_error(event_errc::no_state);
+
+    return async_signal_->raw_data();
+  }
+  #endif
+
+  template <typename X>
+  friend __host__
+  optional<detail::unique_stream>
+  thrust::system::cuda::detail::try_acquire_stream(
+    int device_id, unique_eager_future<X>& parent
+    ) noexcept;
+
+  template <
+    typename X, typename XPointer
+  , typename ComputeContent, typename... Dependencies
+  >
+  friend __host__
+  detail::unique_eager_future_promise_pair<X, XPointer>
+  thrust::system::cuda::detail::make_dependent_future(
+    ComputeContent&& cc, std::tuple<Dependencies...>&& deps
+  );
+
+  friend struct unique_eager_event;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template <typename X, typename Deleter>
+__host__
+optional<unique_stream>
+try_acquire_stream(int, std::unique_ptr<X, Deleter>&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, unique_stream& stream) noexcept
+{
+  return {std::move(stream)};
+}
+
+inline __host__
+optional<unique_stream>
+try_acquire_stream(int, ready_event&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int, ready_future<X>&) noexcept
+{
+  // There's no stream to acquire!
+  return {};
+}
+
+__host__
+optional<unique_stream>
+try_acquire_stream(int device_id, unique_eager_event& parent) noexcept
+{
+  // We have unique ownership, so we can always steal the stream if the future
+  // has one as long as they are on the same device as us.
+  if (parent.valid_stream())
+    if (device_id == parent.device_)
+      return std::move(parent.async_signal_->stream());
+
+  return {};
+}
+
+template <typename X>
+__host__
+optional<unique_stream>
+try_acquire_stream(int device_id, unique_eager_future<X>& parent) noexcept
+{
+  // We have unique ownership, so we can always steal the stream if the future
+  // has one as long as they are on the same device as us.
+  if (parent.valid_stream())
+    if (device_id == parent.device_)
+      return std::move(parent.async_signal_->stream());
+
+  return {};
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream_impl(
+  int, std::tuple<Dependencies...>&, index_sequence<>
+) noexcept
+{
+  // We tried to take a stream from all of our dependencies and failed every
+  // time, so we need to make a new stream.
+  return {unique_stream{}, {}};
+}
+
+template <typename... Dependencies, std::size_t I0, std::size_t... Is>
+__host__
+acquired_stream acquire_stream_impl(
+  int device_id
+, std::tuple<Dependencies...>& deps, index_sequence<I0, Is...>
+) noexcept
+{
+  auto tr = try_acquire_stream(device_id, std::get<I0>(deps));
+
+  if (tr)
+    return {std::move(*tr), {I0}};
+  else
+    return acquire_stream_impl(device_id, deps, index_sequence<Is...>{});
+}
+
+template <typename... Dependencies>
+__host__
+acquired_stream acquire_stream(
+  int device_id
+, std::tuple<Dependencies...>& deps
+) noexcept
+{
+  return acquire_stream_impl(
+    device_id, deps, make_index_sequence<sizeof...(Dependencies)>{}
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename X, typename Deleter>
+__host__
+void create_dependency(
+  unique_stream&, std::unique_ptr<X, Deleter>&
+) noexcept
+{}
+
+inline __host__
+void create_dependency(
+  unique_stream&, ready_event&
+) noexcept
+{}
+
+template <typename T>
+__host__
+void create_dependency(
+  unique_stream&, ready_future<T>&
+) noexcept
+{}
+
+inline __host__
+void create_dependency(
+  unique_stream& child, unique_stream& parent
+)
+{
+  child.depend_on(parent);
+}
+
+inline __host__
+void create_dependency(
+  unique_stream& child, unique_eager_event& parent
+)
+{
+  child.depend_on(parent.stream());
+}
+
+template <typename X>
+__host__
+void create_dependency(
+  unique_stream& child, unique_eager_future<X>& parent
+)
+{
+  child.depend_on(parent.stream());
+}
+
+template <typename... Dependencies>
+__host__
+void create_dependencies_impl(
+  acquired_stream&
+, std::tuple<Dependencies...>&, index_sequence<>
+)
+{}
+
+template <typename... Dependencies, std::size_t I0, std::size_t... Is>
+__host__
+void create_dependencies_impl(
+  acquired_stream& as
+, std::tuple<Dependencies...>& deps, index_sequence<I0, Is...>
+)
+{
+  // We only need to wait on the current dependency if we didn't steal our
+  // stream from it.
+  if (!as.acquired_from || *as.acquired_from != I0)
+  {
+    create_dependency(as.stream, std::get<I0>(deps));
+  }
+
+  create_dependencies_impl(as, deps, index_sequence<Is...>{});
+}
+
+template <typename... Dependencies>
+__host__
+void create_dependencies(acquired_stream& as, std::tuple<Dependencies...>& deps)
+{
+  create_dependencies_impl(
+    as, deps, make_index_sequence<sizeof...(Dependencies)>{}
+  );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Metafunction that determine which `Dependencies` need to be kept alive.
+// Returns the result as an `index_sequence` of indices into the parameter
+// pack.
+template <typename Tuple, typename Indices>
+  struct find_keep_alives_impl;
+template <typename Tuple>
+  using find_keep_alives
+    = typename find_keep_alives_impl<
+        Tuple, make_index_sequence<std::tuple_size<Tuple>::value>
+      >::type;
+
+template <>
+struct find_keep_alives_impl<
+  std::tuple<>, index_sequence<>
+>
+{
+  using type = index_sequence<>;
+};
+
+// User-provided stream.
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_stream, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Nothing to keep alive, skip this index.
+  using type = typename find_keep_alives_impl<
+    std::tuple<Dependencies...>, index_sequence<Is...>
+  >::type;
+};
+
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<ready_event, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Nothing to keep alive, skip this index.
+  using type = typename find_keep_alives_impl<
+    std::tuple<Dependencies...>, index_sequence<Is...>
+  >::type;
+};
+
+template <
+  typename T, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<ready_future<T>, Dependencies...>, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+template <
+  typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_eager_event, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+template <
+  typename X, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<unique_eager_future<X>, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+// Content storage.
+template <
+  typename T, typename Deleter, typename... Dependencies
+, std::size_t I0, std::size_t... Is
+>
+struct find_keep_alives_impl<
+  std::tuple<std::unique_ptr<T, Deleter>, Dependencies...>
+, index_sequence<I0, Is...>
+>
+{
+  // Add this index to the list.
+  using type = integer_sequence_push_front<
+    std::size_t, I0
+  , typename find_keep_alives_impl<
+      std::tuple<Dependencies...>, index_sequence<Is...>
+    >::type
+  >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Dependencies>
+__host__
+unique_eager_event make_dependent_event(std::tuple<Dependencies...>&& deps)
+{
+  int device_id = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
+
+  // First, either steal a stream from one of our children or make a new one.
+  auto as = acquire_stream(device_id, deps);
+
+  // Then, make the stream we've acquired asynchronously wait on all of our
+  // dependencies, except the one we stole the stream from.
+  create_dependencies(as, deps);
+
+  // Then, we determine which subset of dependencies need to be kept alive.
+  auto ka = tuple_subset(
+    std::move(deps)
+  , find_keep_alives<std::tuple<Dependencies...>>{}
+  );
+
+  // Next, we create the asynchronous signal.
+  using async_signal_type = async_keep_alives<decltype(ka)>;
+
+  std::unique_ptr<async_signal_type> sig(
+    new async_signal_type(std::move(as.stream), std::move(ka))
+  );
+
+  // Finally, we create the event object.
+  return unique_eager_event(device_id, std::move(sig));
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <
+  typename X, typename XPointer
+, typename ComputeContent, typename... Dependencies
+>
+__host__
+unique_eager_future_promise_pair<X, XPointer>
+make_dependent_future(ComputeContent&& cc, std::tuple<Dependencies...>&& deps)
+{
+  int device_id = 0;
+  thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
+
+  // First, either steal a stream from one of our children or make a new one.
+  auto as = acquire_stream(device_id, deps);
+
+  // Then, make the stream we've acquired asynchronously wait on all of our
+  // dependencies, except the one we stole the stream from.
+  create_dependencies(as, deps);
+
+  // Then, we determine which subset of dependencies need to be kept alive.
+  auto ka = tuple_subset(
+    std::move(deps)
+  , find_keep_alives<std::tuple<Dependencies...>>{}
+  );
+
+  // Next, we create the asynchronous value.
+  using async_signal_type = async_addressable_value_with_keep_alives<
+    X, XPointer, decltype(ka)
+  >;
+
+  std::unique_ptr<async_signal_type> sig(
+    new async_signal_type(std::move(as.stream), std::move(ka), std::move(cc))
+  );
+ 
+  // Finally, we create the promise and future objects.
+  weak_promise<X, XPointer> child_prom(device_id, sig->data());
+  unique_eager_future<X> child_fut(device_id, std::move(sig));
+
+  return unique_eager_future_promise_pair<X, XPointer>
+    {std::move(child_fut), std::move(child_prom)};
+}
+
+} // namespace detail
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename... Events>
+__host__
+unique_eager_event when_all(Events&&... evs)
+// TODO: Constrain to events, futures, and maybe streams (currently allows keep
+// alives).
+{
+  return detail::make_dependent_event(std::make_tuple(std::move(evs)...)); 
+}
+
+// ADL hook for transparent `.after` move support.
+inline __host__
+auto capture_as_dependency(unique_eager_event& dependency)
+THRUST_DECLTYPE_RETURNS(std::move(dependency))
+
+// ADL hook for transparent `.after` move support.
+template <typename X>
+__host__
+auto capture_as_dependency(unique_eager_future<X>& dependency)
+THRUST_DECLTYPE_RETURNS(std::move(dependency))
+
+}} // namespace system::cuda
+
+THRUST_NAMESPACE_END
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/detail/gather.h b/thrust/system/cuda/detail/gather.h
index c6ae90664..56ff3aecf 100644
--- a/thrust/system/cuda/detail/gather.h
+++ b/thrust/system/cuda/detail/gather.h
@@ -1,22 +1,107 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class MapIt,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+gather(execution_policy<Derived>& policy,
+    MapIt map_first,
+    MapIt map_last,
+    ItemsIt items,
+    ResultIt result)
+{
+  return cuda_cub::transform(policy,
+                          thrust::make_permutation_iterator(items, map_first),
+                          thrust::make_permutation_iterator(items, map_last),
+                          result,
+                          identity());
+}
+
+
+template <class Derived,
+          class MapIt,
+          class StencilIt,
+          class ItemsIt,
+          class ResultIt,
+          class Predicate>
+ResultIt __host__ __device__
+gather_if(execution_policy<Derived>& policy,
+          MapIt                      map_first,
+          MapIt                      map_last,
+          StencilIt                  stencil,
+          ItemsIt                    items,
+          ResultIt                   result,
+          Predicate                  predicate)
+{
+  return cuda_cub::transform_if(policy,
+                              thrust::make_permutation_iterator(items, map_first),
+                              thrust::make_permutation_iterator(items, map_last),
+                              stencil,
+                              result,
+                              identity(),
+                              predicate);
+}
+
+template <class Derived,
+          class MapIt,
+          class StencilIt,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+gather_if(execution_policy<Derived>& policy,
+          MapIt                      map_first,
+          MapIt                      map_last,
+          StencilIt                  stencil,
+          ItemsIt                    items,
+          ResultIt                   result)
+{
+  return cuda_cub::gather_if(policy,
+                          map_first,
+                          map_last,
+                          stencil,
+                          items,
+                          result,
+                          identity());
+}
+
+
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
 
+#endif
diff --git a/thrust/system/cuda/detail/generate.h b/thrust/system/cuda/detail/generate.h
index c6ae90664..ad6340f83 100644
--- a/thrust/system/cuda/detail/generate.h
+++ b/thrust/system/cuda/detail/generate.h
@@ -1,22 +1,90 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/for_each.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+// for_each functor
+template <class Generator>
+struct generate_f
+{
+  Generator generator;
+
+  THRUST_FUNCTION
+  generate_f(Generator generator_) : generator(generator_) {}
+
+  template<class T>
+  THRUST_DEVICE_FUNCTION void operator()(T const& value)
+  {
+    T & lvalue = const_cast<T&>(value);
+    lvalue = generator();
+  }
+};
+
+// for_each_n
+template <class Derived,
+          class OutputIt,
+          class Size,
+          class Generator>
+OutputIt __host__ __device__
+generate_n(execution_policy<Derived> &policy,
+           OutputIt                   result,
+           Size                       count,
+           Generator                  generator)
+{
+  return cuda_cub::for_each_n(policy,
+                              result,
+                              count,
+                              generate_f<Generator>(generator));
+}
+
+  // for_each
+template <class Derived,
+          class OutputIt,
+          class Generator>
+void __host__ __device__
+generate(execution_policy<Derived> &policy,
+         OutputIt                   first,
+         OutputIt                   last,
+         Generator                  generator)
+{
+  cuda_cub::generate_n(policy, first, thrust::distance(first, last), generator);
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/get_value.h b/thrust/system/cuda/detail/get_value.h
index a30bc77e6..9065f773a 100644
--- a/thrust/system/cuda/detail/get_value.h
+++ b/thrust/system/cuda/detail/get_value.h
@@ -17,19 +17,17 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/assign_value.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cross_system.h>
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
 
 
 namespace
@@ -66,14 +64,10 @@ inline __host__ __device__
     }
   };
 
-#ifndef __CUDA_ARCH__
-  return war_nvbugs_881631::host_path(exec, ptr);
-#else
-  return war_nvbugs_881631::device_path(exec, ptr);
-#endif // __CUDA_ARCH__
+  NV_IF_TARGET(NV_IS_HOST,
+               (return war_nvbugs_881631::host_path(exec, ptr);),
+               (return war_nvbugs_881631::device_path(exec, ptr);))
 } // end get_value_msvc2005_war()
-
-
 } // end anon namespace
 
 
@@ -86,8 +80,7 @@ inline __host__ __device__
 } // end get_value()
 
 
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
+} // end cuda_cub
+THRUST_NAMESPACE_END
 
+#endif
diff --git a/thrust/system/cuda/detail/inner_product.h b/thrust/system/cuda/detail/inner_product.h
index c6ae90664..98e9064d2 100644
--- a/thrust/system/cuda/detail/inner_product.h
+++ b/thrust/system/cuda/detail/inner_product.h
@@ -1,22 +1,94 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class T,
+          class ReduceOp,
+          class ProductOp>
+T __host__ __device__
+inner_product(execution_policy<Derived> &policy,
+              InputIt1                   first1,
+              InputIt1                   last1,
+              InputIt2                   first2,
+              T                          init,
+              ReduceOp                   reduce_op,
+              ProductOp                  product_op)
+{
+  typedef typename iterator_traits<InputIt1>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+  typedef transform_pair_of_input_iterators_t<T,
+                                              InputIt1,
+                                              InputIt2,
+                                              ProductOp>
+      binop_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            binop_iterator_t(first1, first2, product_op),
+                            num_items,
+                            init,
+                            reduce_op);
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class T>
+T __host__ __device__
+inner_product(execution_policy<Derived> &policy,
+              InputIt1                   first1,
+              InputIt1                   last1,
+              InputIt2                   first2,
+              T                          init)
+{
+  return cuda_cub::inner_product(policy,
+                                 first1,
+                                 last1,
+                                 first2,
+                                 init,
+                                 plus<T>(),
+                                 multiplies<T>());
+}
+
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/internal/copy_cross_system.h b/thrust/system/cuda/detail/internal/copy_cross_system.h
new file mode 100644
index 000000000..a1208c67c
--- /dev/null
+++ b/thrust/system/cuda/detail/internal/copy_cross_system.h
@@ -0,0 +1,244 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditionu and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+// XXX
+// this file must not be included on its own, ever,
+// but must be part of include in thrust/system/cuda/detail/copy.h
+
+#include <thrust/detail/config.h>
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/distance.h>
+#include <thrust/advance.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __copy {
+
+
+  template <class H,
+            class D,
+            class T,
+            class Size>
+  THRUST_HOST_FUNCTION void
+  trivial_device_copy(thrust::cpp::execution_policy<H>&      ,
+                      thrust::cuda_cub::execution_policy<D>& device_s,
+                      T*                                     dst,
+                      T const*                               src,
+                      Size                                   count)
+  {
+    cudaError status;
+    status = cuda_cub::trivial_copy_to_device(dst,
+                                              src,
+                                              count,
+                                              cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy::trivial_device_copy H->D: failed");
+  }
+
+  template <class D,
+            class H,
+            class T,
+            class Size>
+  THRUST_HOST_FUNCTION void
+  trivial_device_copy(thrust::cuda_cub::execution_policy<D>& device_s,
+                      thrust::cpp::execution_policy<H>&      ,
+                      T*                                     dst,
+                      T const*                               src,
+                      Size                                   count)
+  {
+    cudaError status;
+    status = cuda_cub::trivial_copy_from_device(dst,
+                                                src,
+                                                count,
+                                                cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "trivial_device_copy D->H failed");
+  }
+
+  template <class System1,
+            class System2,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::execution_policy<System1>& sys1,
+                      thrust::execution_policy<System2>& sys2,
+                      InputIt                            begin,
+                      Size                               n,
+                      OutputIt                           result,
+                      thrust::detail::true_type)    // trivial copy
+
+  {
+    typedef typename iterator_traits<InputIt>::value_type InputTy;
+    if (n > 0) {
+      trivial_device_copy(derived_cast(sys1),
+                          derived_cast(sys2),
+                          reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
+                          reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*begin)),
+                          n);
+    }
+
+    return result + n;
+  }
+
+  // non-trivial H->D copy
+  template <class H,
+            class D,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::cpp::execution_policy<H>&      host_s,
+                      thrust::cuda_cub::execution_policy<D>& device_s,
+                      InputIt                                first,
+                      Size                                   num_items,
+                      OutputIt                               result,
+                      thrust::detail::false_type)    // non-trivial copy
+  {
+    // get type of the input data
+    typedef typename thrust::iterator_value<InputIt>::type InputTy;
+
+    // copy input data into host temp storage
+    InputIt last = first;
+    thrust::advance(last, num_items);
+    thrust::detail::temporary_array<InputTy, H> temp(host_s, num_items);
+
+    for (Size idx = 0; idx != num_items; idx++)
+    {
+      ::new (static_cast<void*>(temp.data().get()+idx)) InputTy(*first);
+      ++first;
+    }
+
+    // allocate device temporary storage
+    thrust::detail::temporary_array<InputTy, D> d_in_ptr(device_s, num_items);
+
+    // trivial copy data from host to device
+    cudaError status = cuda_cub::trivial_copy_to_device(d_in_ptr.data().get(),
+                                                        temp.data().get(),
+                                                        num_items,
+                                                        cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy:: H->D: failed");
+
+
+    // device->device copy
+    OutputIt ret = cuda_cub::copy_n(device_s, d_in_ptr.data(), num_items, result);
+
+    return ret;
+  }
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+  // non-trivial copy D->H, only supported with NVCC compiler
+  // because copy ctor must have  __device__ annotations, which is nvcc-only
+  // feature
+  template <class D,
+            class H,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(thrust::cuda_cub::execution_policy<D>& device_s,
+                      thrust::cpp::execution_policy<H>&   host_s,
+                      InputIt                             first,
+                      Size                                num_items,
+                      OutputIt                            result,
+                      thrust::detail::false_type)    // non-trivial copy
+
+  {
+    // get type of the input data
+    typedef typename thrust::iterator_value<InputIt>::type InputTy;
+
+    // allocate device temp storage 
+    thrust::detail::temporary_array<InputTy, D> d_in_ptr(device_s, num_items);
+
+    // uninitialize copy into temp device storage
+    cuda_cub::uninitialized_copy_n(device_s, first, num_items, d_in_ptr.data());
+
+    // allocate host temp storage
+    thrust::detail::temporary_array<InputTy, H> temp(host_s, num_items);
+
+    // trivial copy from device to host
+    cudaError status;
+    status = cuda_cub::trivial_copy_from_device(temp.data().get(),
+                                                d_in_ptr.data().get(),
+                                                num_items,
+                                                cuda_cub::stream(device_s));
+    cuda_cub::throw_on_error(status, "__copy:: D->H: failed");
+
+    // host->host copy
+    OutputIt ret = thrust::copy_n(host_s, temp.data(), num_items, result);
+
+    return ret;
+  }
+#endif
+
+  template <class System1,
+            class System2,
+            class InputIt,
+            class Size,
+            class OutputIt>
+  OutputIt __host__
+  cross_system_copy_n(cross_system<System1, System2> systems,
+                      InputIt  begin,
+                      Size     n,
+                      OutputIt result)
+  {
+    return cross_system_copy_n(
+        derived_cast(systems.sys1),
+        derived_cast(systems.sys2),
+        begin,
+        n,
+        result,
+        typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
+  }
+
+  template <class System1,
+            class System2,
+            class InputIterator,
+            class OutputIterator>
+  OutputIterator __host__
+  cross_system_copy(cross_system<System1, System2> systems,
+                    InputIterator  begin,
+                    InputIterator  end,
+                    OutputIterator result)
+  {
+    return cross_system_copy_n(systems,
+                               begin,
+                               thrust::distance(begin, end),
+                               result);
+  }
+
+}    // namespace __copy
+
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/system/cuda/detail/internal/copy_device_to_device.h
new file mode 100644
index 000000000..69c4e20df
--- /dev/null
+++ b/thrust/system/cuda/detail/internal/copy_device_to_device.h
@@ -0,0 +1,64 @@
+
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/functional.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __copy {
+
+  template <class Derived,
+            class InputIt,
+            class OutputIt>
+  OutputIt THRUST_RUNTIME_FUNCTION
+  device_to_device(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   OutputIt                   result)
+  {
+    typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+    return cuda_cub::transform(policy,
+                            first,
+                            last,
+                            result,
+                            thrust::identity<InputTy>());
+  }
+
+}    // namespace __copy
+
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/iter_swap.h b/thrust/system/cuda/detail/iter_swap.h
index 75030112e..c0628610a 100644
--- a/thrust/system/cuda/detail/iter_swap.h
+++ b/thrust/system/cuda/detail/iter_swap.h
@@ -17,22 +17,24 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
 #include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/swap.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
 
 
-template<typename Pointer1, typename Pointer2>
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 inline __host__ __device__
-void iter_swap(tag, Pointer1 a, Pointer2 b)
+void iter_swap(thrust::cuda::execution_policy<DerivedPolicy> &, Pointer1 a, Pointer2 b)
 {
   // XXX war nvbugs/881631
   struct war_nvbugs_881631
@@ -50,16 +52,15 @@ void iter_swap(tag, Pointer1 a, Pointer2 b)
     }
   };
 
-#ifndef __CUDA_ARCH__
-  return war_nvbugs_881631::host_path(a,b);
-#else
-  return war_nvbugs_881631::device_path(a,b);
-#endif // __CUDA_ARCH__
-} // end iter_swap()
+  NV_IF_TARGET(NV_IS_HOST, (
+    war_nvbugs_881631::host_path(a, b);
+  ), (
+    war_nvbugs_881631::device_path(a, b);
+  ));
 
+} // end iter_swap()
 
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
 
+} // end cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/default_decomposition.h b/thrust/system/cuda/detail/make_unsigned_special.h
similarity index 51%
rename from thrust/system/cuda/detail/default_decomposition.h
rename to thrust/system/cuda/detail/make_unsigned_special.h
index d95558c09..dda735767 100644
--- a/thrust/system/cuda/detail/default_decomposition.h
+++ b/thrust/system/cuda/detail/make_unsigned_special.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2019 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,35 +14,30 @@
  *  limitations under the License.
  */
 
-
-/*! \file default_decomposition.h
- *  \brief Return a decomposition that is appropriate for the CUDA backend.
- */
-
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/detail/internal/decompose.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace detail {
 
+    template<typename Size>
+    struct make_unsigned_special;
 
-template<typename IndexType>
-__host__ __device__
-thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n);
+    template<>
+    struct make_unsigned_special<int> { typedef unsigned int type; };
 
+    // this is special, because CUDA's atomicAdd doesn't have an overload
+    // for unsigned long, for some godforsaken reason
+    template<>
+    struct make_unsigned_special<long> { typedef unsigned long long type; };
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+    template<>
+    struct make_unsigned_special<long long> { typedef unsigned long long type; };
 
-#include <thrust/system/cuda/detail/default_decomposition.inl>
+}
+}
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/malloc_and_free.h b/thrust/system/cuda/detail/malloc_and_free.h
index 0ad97225c..1b12e2cc3 100644
--- a/thrust/system/cuda/detail/malloc_and_free.h
+++ b/thrust/system/cuda/detail/malloc_and_free.h
@@ -16,26 +16,36 @@
 
 #pragma once
 
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <thrust/system/system_error.h>
-#include <thrust/system/cuda/error.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/seq.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/util.h>
 #include <thrust/system/detail/bad_alloc.h>
-#include <thrust/system/cuda/detail/throw_on_error.h>
 #include <thrust/detail/malloc_and_free.h>
-#include <thrust/detail/seq.h>
 
+#ifdef THRUST_CACHING_DEVICE_MALLOC
+#include <cub/util_allocator.cuh>
+#endif
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+#ifdef THRUST_CACHING_DEVICE_MALLOC
+#define __CUB_CACHING_MALLOC
+#ifndef __CUDA_ARCH__
+inline cub::CachingDeviceAllocator &get_allocator()
 {
+  static cub::CachingDeviceAllocator g_allocator(true);
+  return g_allocator;
+}
+#endif
+#endif
 
 
 // note that malloc returns a raw pointer to avoid
@@ -46,16 +56,34 @@ void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
 {
   void *result = 0;
 
-#ifndef __CUDA_ARCH__
-  // XXX use cudaMalloc in __device__ code when it becomes available
-  cudaError_t error = cudaMalloc(reinterpret_cast<void**>(&result), n);
-
-  if(error)
-  {
-    throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(error).c_str());
-  } // end if
-#else
-  result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+  // need to repeat a lot of code here because we can't use #if inside of the
+  // NV_IF_TARGET macro.
+  // The device path is the same either way, but the host allocations differ.
+#ifdef __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cub::CachingDeviceAllocator &alloc = get_allocator();
+    cudaError_t status = alloc.DeviceAllocate(&result, n);
+
+    if (status != cudaSuccess)
+    {
+      cudaGetLastError(); // Clear global CUDA error state.
+      throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+    }
+  ), ( // NV_IS_DEVICE
+    result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+  ));
+#else // not __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cudaError_t status = cudaMalloc(&result, n);
+
+    if (status != cudaSuccess)
+    {
+      cudaGetLastError(); // Clear global CUDA error state.
+      throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+    }
+  ), ( // NV_IS_DEVICE
+    result = thrust::raw_pointer_cast(thrust::malloc(thrust::seq, n));
+  ));
 #endif
 
   return result;
@@ -66,17 +94,26 @@ template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
 void free(execution_policy<DerivedPolicy> &, Pointer ptr)
 {
-#ifndef __CUDA_ARCH__
-  // XXX use cudaFree in __device__ code when it becomes available
-  throw_on_error(cudaFree(thrust::raw_pointer_cast(ptr)), "cudaFree in free");
-#else
-  thrust::free(thrust::seq, ptr);
+  // need to repeat a lot of code here because we can't use #if inside of the
+  // NV_IF_TARGET macro.
+  // The device path is the same either way, but the host deallocations differ.
+#ifdef __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cub::CachingDeviceAllocator &alloc = get_allocator();
+    cudaError_t status = alloc.DeviceFree(thrust::raw_pointer_cast(ptr));
+    cuda_cub::throw_on_error(status, "device free failed");
+  ), ( // NV_IS_DEVICE
+    thrust::free(thrust::seq, ptr);
+  ));
+#else // not __CUB_CACHING_MALLOC
+  NV_IF_TARGET(NV_IS_HOST, (
+    cudaError_t status = cudaFree(thrust::raw_pointer_cast(ptr));
+    cuda_cub::throw_on_error(status, "device free failed");
+  ), ( // NV_IS_DEVICE
+    thrust::free(thrust::seq, ptr);
+  ));
 #endif
 } // end free()
 
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/memory.inl b/thrust/system/cuda/detail/memory.inl
index 371d38dbb..f6fc98359 100644
--- a/thrust/system/cuda/detail/memory.inl
+++ b/thrust/system/cuda/detail/memory.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,73 +14,29 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cuda/memory.h>
 #include <thrust/system/cuda/detail/malloc_and_free.h>
 #include <limits>
 
-namespace thrust
-{
-
-// XXX WAR an issue with MSVC 2005 (cl v14.00) incorrectly implementing
-//     pointer_raw_pointer for pointer by specializing it here
-//     note that we specialize it here, before the use of raw_pointer_cast
-//     below, which causes pointer_raw_pointer's instantiation
-#if (THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC) && (_MSC_VER <= 1400)
-namespace detail
-{
-
-template<typename T>
-  struct pointer_raw_pointer< thrust::cuda::pointer<T> >
-{
-  typedef typename thrust::cuda::pointer<T>::raw_pointer type;
-}; // end pointer_raw_pointer
-
-} // end detail
-#endif
-
-namespace system
-{
-namespace cuda
-{
-
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
 {
-  a.swap(b);
-} // end swap()
 
 __host__ __device__
 pointer<void> malloc(std::size_t n)
 {
   tag cuda_tag;
-  return pointer<void>(thrust::system::cuda::detail::malloc(cuda_tag, n));
+  return pointer<void>(thrust::cuda_cub::malloc(cuda_tag, n));
 } // end malloc()
 
 template<typename T>
 __host__ __device__
 pointer<T> malloc(std::size_t n)
 {
-  pointer<void> raw_ptr = thrust::system::cuda::malloc(sizeof(T) * n);
+  pointer<void> raw_ptr = thrust::cuda_cub::malloc(sizeof(T) * n);
   return pointer<T>(reinterpret_cast<T*>(raw_ptr.get()));
 } // end malloc()
 
@@ -88,10 +44,9 @@ __host__ __device__
 void free(pointer<void> ptr)
 {
   tag cuda_tag;
-  return thrust::system::cuda::detail::free(cuda_tag, ptr.get());
+  return thrust::cuda_cub::free(cuda_tag, ptr.get());
 } // end free()
 
-} // end cuda
-} // end system
-} // end thrust
+} // end cuda_cub
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/merge.h b/thrust/system/cuda/detail/merge.h
index 42fbf9bf2..478e3508d 100644
--- a/thrust/system/cuda/detail/merge.h
+++ b/thrust/system/cuda/detail/merge.h
@@ -1,53 +1,998 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+j * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/merge.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
 
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __merge {
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class BinaryPred>
+  Size THRUST_DEVICE_FUNCTION
+  merge_path(KeysIt1    keys1,
+             KeysIt2    keys2,
+             Size       keys1_count,
+             Size       keys2_count,
+             Size       diag,
+             BinaryPred binary_pred)
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type key2_type;
+
+    Size keys1_begin = thrust::max<Size>(0, diag - keys2_count);
+    Size keys1_end   = thrust::min<Size>(diag, keys1_count);
+
+    while (keys1_begin < keys1_end)
+    {
+      Size mid = (keys1_begin + keys1_end) >> 1;
+      key1_type key1 = keys1[mid];
+      key2_type key2 = keys2[diag - 1 - mid];
+      bool pred = binary_pred(key2, key1);
+      if (pred)
+      {
+        keys1_end = mid;
+      }
+      else
+      {
+        keys1_begin = mid+1;
+      }
+    }
+    return keys1_begin;
+  }
+
+  template <class It, class T2, class CompareOp, int ITEMS_PER_THREAD>
+  THRUST_DEVICE_FUNCTION void
+  serial_merge(It  keys_shared,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T2 (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+  {
+    int keys1_end = keys1_beg + keys1_count;
+    int keys2_end = keys2_beg + keys2_count;
+
+    typedef typename iterator_value<It>::type key_type;
+
+    key_type key1 = keys_shared[keys1_beg];
+    key_type key2 = keys_shared[keys2_beg];
+
+
+#pragma unroll
+    for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+    {
+      bool p = (keys2_beg < keys2_end) &&
+               ((keys1_beg >= keys1_end) ||
+                compare_op(key2,key1));
+
+      output[ITEM]  = p ? key2 : key1;
+      indices[ITEM] = p ? keys2_beg++ : keys1_beg++;
+
+      if (p)
+      {
+        key2 = keys_shared[keys2_beg];
+      }
+      else
+      {
+        key1 = keys_shared[keys1_beg];
+      }
+    }
+  }
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockStoreAlgorithm _STORE_ALGORITHM  = cub::BLOCK_STORE_DIRECT>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM;
+  };    // PtxPolicy
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class CompareOp>
+  struct PartitionAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<256> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    THRUST_AGENT_ENTRY(KeysIt1   keys1,
+                       KeysIt2   keys2,
+                       Size      keys1_count,
+                       Size      keys2_count,
+                       Size      num_partitions,
+                       Size*     merge_partitions,
+                       CompareOp compare_op,
+                       int       items_per_tile,
+                       char*     /*shmem*/)
+    {
+      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      if (partition_idx < num_partitions)
+      {
+        Size partition_at = (thrust::min)(partition_idx * items_per_tile,
+                                        keys1_count + keys2_count);
+        Size partition_diag = merge_path(keys1,
+                                         keys2,
+                                         keys1_count,
+                                         keys2_count,
+                                         partition_at,
+                                         compare_op);
+        merge_partitions[partition_idx] = partition_diag;
+      }
+    }
+  };    // struct PartitionAgent
+
+
+  template <class Arch, class TSize>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<int NOMINAL_4B_ITEMS_PER_THREAD, size_t INPUT_SIZE>
+  struct items_per_thread
+  {
+    enum
+    {
+      ITEMS_PER_THREAD =
+          mpl::min<
+              int,
+              NOMINAL_4B_ITEMS_PER_THREAD,
+              mpl::max<
+                  int,
+                  1,
+                  static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 / INPUT_SIZE)>::value>::value,
+      value = mpl::is_odd<int, ITEMS_PER_THREAD>::value
+                  ? ITEMS_PER_THREAD
+                  : ITEMS_PER_THREAD + 1
+    };
+  };
+
+  template<class TSize>
+  struct Tuning<sm30,TSize>
+  {
+    const static int INPUT_SIZE = TSize::value;
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          INPUT_SIZE>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm300
+
+
+
+  template<class TSize>
+  struct Tuning<sm60,TSize> : Tuning<sm30,TSize>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 15,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm52
+
+  template<class TSize>
+  struct Tuning<sm52,TSize> : Tuning<sm30,TSize>
+  {
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 13,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm52
+
+  template<class TSize>
+  struct Tuning<sm35,TSize> : Tuning<sm30,TSize>
+  {
+    const static int INPUT_SIZE = TSize::value;
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      ITEMS_PER_THREAD            = items_per_thread<NOMINAL_4B_ITEMS_PER_THREAD,
+                                          Tuning::INPUT_SIZE>::value
+    };
+
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_STORE_WARP_TRANSPOSE>
+        type;
+  };    // Tuning sm350
+
+
+  template<size_t VALUE>
+  struct integer_constant : thrust::detail::integral_constant<size_t, VALUE> {};
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class ItemsIt1,
+            class ItemsIt2,
+            class Size,
+            class KeysOutputIt,
+            class ItemsOutputIt,
+            class CompareOp,
+            class MERGE_ITEMS>
+  struct MergeAgent
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type  key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type  key2_type;
+    typedef typename iterator_traits<ItemsIt1>::value_type item1_type;
+    typedef typename iterator_traits<ItemsIt2>::value_type item2_type;
+
+    typedef key1_type  key_type;
+    typedef item1_type item_type;
+
+    typedef typename thrust::detail::conditional<
+        MERGE_ITEMS::value,
+        integer_constant<sizeof(key_type) + sizeof(item_type)>,
+        integer_constant<sizeof(key_type)> >::type tuning_type;
+
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, tuning_type>::type
+    {
+      typedef Tuning<Arch,tuning_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt1>::type  KeysLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, KeysIt2>::type  KeysLoadIt2;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt1>::type ItemsLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt2>::type ItemsLoadIt2;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type  BlockLoadKeys1;
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type  BlockLoadKeys2;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt1>::type BlockLoadItems1;
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt2>::type BlockLoadItems2;
+
+      typedef typename core::BlockStore<PtxPlan,
+                                        KeysOutputIt,
+                                        key_type>::type BlockStoreKeys;
+      typedef typename core::BlockStore<PtxPlan,
+                                        ItemsOutputIt,
+                                        item_type>::type BlockStoreItems;
+
+      // gather required temporary storage in a union
+      //
+      union TempStorage
+      {
+        typename BlockLoadKeys1::TempStorage  load_keys1;
+        typename BlockLoadKeys2::TempStorage  load_keys2;
+        typename BlockLoadItems1::TempStorage load_items1;
+        typename BlockLoadItems2::TempStorage load_items2;
+        typename BlockStoreKeys::TempStorage  store_keys;
+        typename BlockStoreItems::TempStorage store_items;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE + 1> items_shared;
+        core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE + 1>  keys_shared;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt1     KeysLoadIt1;
+    typedef typename ptx_plan::KeysLoadIt2     KeysLoadIt2;
+    typedef typename ptx_plan::ItemsLoadIt1    ItemsLoadIt1;
+    typedef typename ptx_plan::ItemsLoadIt2    ItemsLoadIt2;
+    typedef typename ptx_plan::BlockLoadKeys1  BlockLoadKeys1;
+    typedef typename ptx_plan::BlockLoadKeys2  BlockLoadKeys2;
+    typedef typename ptx_plan::BlockLoadItems1 BlockLoadItems1;
+    typedef typename ptx_plan::BlockLoadItems2 BlockLoadItems2;
+    typedef typename ptx_plan::BlockStoreKeys  BlockStoreKeys;
+    typedef typename ptx_plan::BlockStoreItems BlockStoreItems;
+    typedef typename ptx_plan::TempStorage     TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage&  storage;
+      KeysLoadIt1   keys1_in;
+      KeysLoadIt2   keys2_in;
+      ItemsLoadIt1  items1_in;
+      ItemsLoadIt2  items2_in;
+      Size          keys1_count;
+      Size          keys2_count;
+      KeysOutputIt  keys_out;
+      ItemsOutputIt items_out;
+      CompareOp     compare_op;
+      Size*         merge_partitions;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE, class T, class It1, class It2>
+      THRUST_DEVICE_FUNCTION void
+      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+                  It1 input1,
+                  It2 input2,
+                  int count1,
+                  int count2)
+      {
+        if (IS_FULL_TILE)
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1)
+              output[ITEM] = input1[idx];
+            else
+              output[ITEM] = input2[idx - count1];
+          }
+        }
+        else
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1 + count2)
+            {
+              if (idx < count1)
+                output[ITEM] = input1[idx];
+              else
+                output[ITEM] = input2[idx - count1];
+            }
+          }
+        }
+      }
+
+      template <class T, class It>
+      THRUST_DEVICE_FUNCTION void
+      reg_to_shared(It output,
+                    T (&input)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+          output[idx] = input[ITEM];
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile(Size tile_idx,
+                   Size tile_base,
+                   int  num_remaining)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
 
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename RandomAccessIterator3,
-         typename StrictWeakOrdering>
-__host__ __device__
-RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            StrictWeakOrdering comp);
+        Size partition_beg = merge_partitions[tile_idx + 0];
+        Size partition_end = merge_partitions[tile_idx + 1];
 
+        Size diag0 = ITEMS_PER_TILE * tile_idx;
+        Size diag1 = (thrust::min)(keys1_count + keys2_count, diag0 + ITEMS_PER_TILE);
 
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
+        // compute bounding box for keys1 & keys2
+        //
+        Size keys1_beg = partition_beg;
+        Size keys1_end = partition_end;
+        Size keys2_beg = diag0 - keys1_beg;
+        Size keys2_end = diag1 - keys1_end;
+
+        // number of keys per tile
+        //
+        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+        key_type keys_loc[ITEMS_PER_THREAD];
+        gmem_to_reg<IS_FULL_TILE>(keys_loc,
+                                  keys1_in + keys1_beg,
+                                  keys2_in + keys2_beg,
+                                  num_keys1,
+                                  num_keys2);
+        reg_to_shared(&storage.keys_shared[0], keys_loc);
+
+        sync_threadblock();
+
+        // use binary search in shared memory
+        // to find merge path for each of thread
+        // we can use int type here, because the number of
+        // items in shared memory is limited
+        //
+        int diag0_loc = min<int>(num_keys1 + num_keys2,
+                                 ITEMS_PER_THREAD * threadIdx.x);
+
+        int keys1_beg_loc = merge_path(&storage.keys_shared[0],
+                                       &storage.keys_shared[num_keys1],
+                                       num_keys1,
+                                       num_keys2,
+                                       diag0_loc,
+                                       compare_op);
+        int keys1_end_loc = num_keys1;
+        int keys2_beg_loc = diag0_loc - keys1_beg_loc;
+        int keys2_end_loc = num_keys2;
+
+        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+        // perform serial merge
+        //
+        int indices[ITEMS_PER_THREAD];
+
+        serial_merge(&storage.keys_shared[0],
+                     keys1_beg_loc,
+                     keys2_beg_loc + num_keys1,
+                     num_keys1_loc,
+                     num_keys2_loc,
+                     keys_loc,
+                     indices,
+                     compare_op);
+
+        sync_threadblock();
+
+        // write keys
+        //
+        if (IS_FULL_TILE)
+        {
+          BlockStoreKeys(storage.store_keys)
+              .Store(keys_out + tile_base, keys_loc);
+        }
+        else
+        {
+          BlockStoreKeys(storage.store_keys)
+              .Store(keys_out + tile_base, keys_loc, num_remaining);
+        }
+
+        // if items are provided, merge them
+        if (MERGE_ITEMS::value)
+        {
+          item_type items_loc[ITEMS_PER_THREAD];
+          gmem_to_reg<IS_FULL_TILE>(items_loc,
+                                    items1_in + keys1_beg,
+                                    items2_in + keys2_beg,
+                                    num_keys1,
+                                    num_keys2);
+
+          sync_threadblock();
+
+          reg_to_shared(&storage.items_shared[0], items_loc);
+
+          sync_threadblock();
+
+          // gather items from shared mem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            items_loc[ITEM] = storage.items_shared[indices[ITEM]];
+          }
+
+          sync_threadblock();
+
+          // write form reg to gmem
+          //
+          if (IS_FULL_TILE)
+          {
+            BlockStoreItems(storage.store_items)
+                .Store(items_out + tile_base, items_loc);
+          }
+          else
+          {
+            BlockStoreItems(storage.store_items)
+                .Store(items_out + tile_base, items_loc, num_remaining);
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage&  storage_,
+           KeysLoadIt1   keys1_in_,
+           KeysLoadIt2   keys2_in_,
+           ItemsLoadIt1  items1_in_,
+           ItemsLoadIt2  items2_in_,
+           Size          keys1_count_,
+           Size          keys2_count_,
+           KeysOutputIt  keys_out_,
+           ItemsOutputIt items_out_,
+           CompareOp     compare_op_,
+           Size*         merge_partitions_)
+          : storage(storage_),
+            keys1_in(keys1_in_),
+            keys2_in(keys2_in_),
+            items1_in(items1_in_),
+            items2_in(items2_in_),
+            keys1_count(keys1_count_),
+            keys2_count(keys2_count_),
+            keys_out(keys_out_),
+            items_out(items_out_),
+            compare_op(compare_op_),
+            merge_partitions(merge_partitions_)
+      {
+        // XXX with 8.5 chaging type to Size (or long long) results in error!
+        int  tile_idx      = blockIdx.x;
+        Size  tile_base     = tile_idx * ITEMS_PER_TILE;
+        int  items_in_tile = static_cast<int>(
+            min<Size>(ITEMS_PER_TILE,
+                      keys1_count + keys2_count - tile_base));
+        if (items_in_tile == ITEMS_PER_TILE)
+        {
+          // full tile
+          consume_tile<true>(tile_idx,
+                             tile_base,
+                             ITEMS_PER_TILE);
+        }
+        else
+        {
+          // partial tile
+          consume_tile<false>(tile_idx,
+                              tile_base,
+                              items_in_tile);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1       keys1_in,
+                       KeysIt2       keys2_in,
+                       ItemsIt1      items1_in,
+                       ItemsIt2      items2_in,
+                       Size          keys1_count,
+                       Size          keys2_count,
+                       KeysOutputIt  keys_out,
+                       ItemsOutputIt items_out,
+                       CompareOp     compare_op,
+                       Size*         merge_partitions,
+                       char*         shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(storage,
+           core::make_load_iterator(ptx_plan(), keys1_in),
+           core::make_load_iterator(ptx_plan(), keys2_in),
+           core::make_load_iterator(ptx_plan(), items1_in),
+           core::make_load_iterator(ptx_plan(), items2_in),
+           keys1_count,
+           keys2_count,
+           keys_out,
+           items_out,
+           compare_op,
+           merge_partitions);
+    }
+  };    // struct MergeAgent;
+
+  //---------------------------------------------------------------------
+  // Two-step internal API
+  //---------------------------------------------------------------------
+
+  template <class MERGE_ITEMS,
+            class KeysIt1,
+            class KeysIt2,
+            class ItemsIt1,
+            class ItemsIt2,
+            class Size,
+            class KeysOutputIt,
+            class ItemsOutputIt,
+            class CompareOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void*         d_temp_storage,
+            size_t&       temp_storage_bytes,
+            KeysIt1       keys1,
+            KeysIt2       keys2,
+            ItemsIt1      items1,
+            ItemsIt2      items2,
+            Size          num_keys1,
+            Size          num_keys2,
+            KeysOutputIt  keys_result,
+            ItemsOutputIt items_result,
+            CompareOp     compare_op,
+            cudaStream_t  stream)
+  {
+    if (num_keys1 + num_keys2 == 0)
+      return cudaErrorNotSupported;
+
+    using core::AgentPlan;
+    using core::get_agent_plan;
+    typedef core::AgentLauncher<
+        MergeAgent<KeysIt1,
+                   KeysIt2,
+                   ItemsIt1,
+                   ItemsIt2,
+                   Size,
+                   KeysOutputIt,
+                   ItemsOutputIt,
+                   CompareOp,
+                   MERGE_ITEMS> >
+        merge_agent;
+
+    typedef core::AgentLauncher<
+        PartitionAgent<KeysIt1,
+                       KeysIt2,
+                       Size,
+                       CompareOp> >
+        partition_agent;
+
+    cudaError_t status = cudaSuccess;
+
+    AgentPlan partition_plan = partition_agent::get_plan();
+    AgentPlan merge_plan     = merge_agent::get_plan(stream);
+
+    int  tile_size = merge_plan.items_per_tile;
+    Size num_tiles = (num_keys1 + num_keys2 + tile_size - 1) / tile_size;
+
+    size_t temp_storage1 = (1 + num_tiles) * sizeof(Size);
+    size_t temp_storage2 = core::vshmem_size(merge_plan.shared_memory_size,
+                                             num_tiles);
+
+    void*  allocations[2]      = {NULL, NULL};
+    size_t allocation_sizes[2] = {temp_storage1, temp_storage2};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_bytes,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    // partition data into work balanced tiles
+    Size* merge_partitions = (Size*)allocations[0];
+    char* vshmem_ptr       = temp_storage2 > 0 ? (char*)allocations[1] : NULL;
+
+    {
+      Size num_partitions = num_tiles + 1;
+
+      partition_agent(partition_plan, num_partitions, stream, "partition agent")
+          .launch(keys1,
+                  keys2,
+                  num_keys1,
+                  num_keys2,
+                  num_partitions,
+                  merge_partitions,
+                  compare_op,
+                  merge_plan.items_per_tile);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    merge_agent(merge_plan, num_keys1 + num_keys2, stream, vshmem_ptr, "merge agent")
+        .launch(keys1,
+                keys2,
+                items1,
+                items2,
+                num_keys1,
+                num_keys2,
+                keys_result,
+                items_result,
+                compare_op,
+                merge_partitions);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return status;
+  }
+
+  template <typename MERGE_ITEMS,
+            typename Derived,
+            typename KeysIt1,
+            typename KeysIt2,
+            typename ItemsIt1,
+            typename ItemsIt2,
+            typename KeysOutputIt,
+            typename ItemsOutputIt,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ItemsOutputIt>
+  merge(execution_policy<Derived>& policy,
+        KeysIt1                    keys1_first,
+        KeysIt1                    keys1_last,
+        KeysIt2                    keys2_first,
+        KeysIt2                    keys2_last,
+        ItemsIt1                   items1_first,
+        ItemsIt2                   items2_first,
+        KeysOutputIt               keys_result,
+        ItemsOutputIt              items_result,
+        CompareOp                  compare_op)
+  {
+    typedef typename iterator_traits<KeysIt1>::difference_type size_type;
+
+    size_type num_keys1
+      = static_cast<size_type>(thrust::distance(keys1_first, keys1_last));
+    size_type num_keys2
+      = static_cast<size_type>(thrust::distance(keys2_first, keys2_last));
+
+    size_type const count = num_keys1 + num_keys2;
+
+    if (count == 0)
+      return thrust::make_pair(keys_result, items_result);
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = doit_step<MERGE_ITEMS>(NULL,
+                                    storage_size,
+                                    keys1_first,
+                                    keys2_first,
+                                    items1_first,
+                                    items2_first,
+                                    num_keys1,
+                                    num_keys2,
+                                    keys_result,
+                                    items_result,
+                                    compare_op,
+                                    stream);
+    cuda_cub::throw_on_error(status, "merge: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_step<MERGE_ITEMS>(ptr,
+                                    storage_size,
+                                    keys1_first,
+                                    keys2_first,
+                                    items1_first,
+                                    items2_first,
+                                    num_keys1,
+                                    num_keys2,
+                                    keys_result,
+                                    items_result,
+                                    compare_op,
+                                    stream);
+    cuda_cub::throw_on_error(status, "merge: failed on 2nd step");
+
+    status = cuda_cub::synchronize_optional(policy);
+    cuda_cub::throw_on_error(status, "merge: failed to synchronize");
+
+    return thrust::make_pair(keys_result + count, items_result + count);
+  }
+}    // namespace __merge
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ResultIt,
+          class CompareOp>
+ResultIt __host__ __device__
+merge(execution_policy<Derived>& policy,
+      KeysIt1                    keys1_first,
+      KeysIt1                    keys1_last,
+      KeysIt2                    keys2_first,
+      KeysIt2                    keys2_last,
+      ResultIt                   result,
+      CompareOp                  compare_op)
+
+{
+  THRUST_CDP_DISPATCH((using keys_type  = thrust::iterator_value_t<KeysIt1>;
+                       keys_type *null_ = nullptr;
+                       auto tmp =
+                         __merge::merge<thrust::detail::false_type>(policy,
+                                                                    keys1_first,
+                                                                    keys1_last,
+                                                                    keys2_first,
+                                                                    keys2_last,
+                                                                    null_,
+                                                                    null_,
+                                                                    result,
+                                                                    null_,
+                                                                    compare_op);
+                       result = tmp.first;),
+                      (result = thrust::merge(cvt_to_seq(derived_cast(policy)),
+                                              keys1_first,
+                                              keys1_last,
+                                              keys2_first,
+                                              keys2_last,
+                                              result,
+                                              compare_op);));
+  return result;
+}
+
+template <class Derived, class KeysIt1, class KeysIt2, class ResultIt>
+ResultIt __host__ __device__
+merge(execution_policy<Derived>& policy,
+      KeysIt1                    keys1_first,
+      KeysIt1                    keys1_last,
+      KeysIt2                    keys2_first,
+      KeysIt2                    keys2_last,
+      ResultIt                   result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
+  return cuda_cub::merge(policy,
+                         keys1_first,
+                         keys1_last,
+                         keys2_first,
+                         keys2_last,
+                         result,
+                         less<keys_type>());
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+merge_by_key(execution_policy<Derived> &policy,
+             KeysIt1                    keys1_first,
+             KeysIt1                    keys1_last,
+             KeysIt2                    keys2_first,
+             KeysIt2                    keys2_last,
+             ItemsIt1                   items1_first,
+             ItemsIt2                   items2_first,
+             KeysOutputIt               keys_result,
+             ItemsOutputIt              items_result,
+             CompareOp                  compare_op)
+{
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __merge::merge<thrust::detail::true_type>(policy,
+                                                     keys1_first,
+                                                     keys1_last,
+                                                     keys2_first,
+                                                     keys2_last,
+                                                     items1_first,
+                                                     items2_first,
+                                                     keys_result,
+                                                     items_result,
+                                                     compare_op);),
+    (ret = thrust::merge_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys1_first,
+                                keys1_last,
+                                keys2_first,
+                                keys2_last,
+                                items1_first,
+                                items2_first,
+                                keys_result,
+                                items_result,
+                                compare_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+merge_by_key(execution_policy<Derived> &policy,
+             KeysIt1                    keys1_first,
+             KeysIt1                    keys1_last,
+             KeysIt2                    keys2_first,
+             KeysIt2                    keys2_last,
+             ItemsIt1                   items1_first,
+             ItemsIt2                   items2_first,
+             KeysOutputIt               keys_result,
+             ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type keys_type;
+  return cuda_cub::merge_by_key(policy,
+                                keys1_first,
+                                keys1_last,
+                                keys2_first,
+                                keys2_last,
+                                items1_first,
+                                items2_first,
+                                keys_result,
+                                items_result,
+                                thrust::less<keys_type>());
+}
 
-#include <thrust/system/cuda/detail/merge.inl>
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/merge.inl b/thrust/system/cuda/detail/merge.inl
deleted file mode 100644
index 4cc934fbd..000000000
--- a/thrust/system/cuda/detail/merge.inl
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/merge.h>
-#include <thrust/detail/seq.h>
-#include <thrust/system/cuda/detail/merge.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/tabulate.h>
-#include <thrust/iterator/detail/join_iterator.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace merge_detail
-{
-
-
-template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename Size,typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4, typename Compare>
-__device__
-RandomAccessIterator4
-  staged_merge(bulk_::concurrent_group<bulk_::agent<grainsize>,groupsize> &exec,
-               RandomAccessIterator1 first1, Size n1,
-               RandomAccessIterator2 first2, Size n2,
-               RandomAccessIterator3 stage,
-               RandomAccessIterator4 result,
-               Compare comp)
-{
-  // copy into the stage
-  bulk_::copy_n(bulk_::bound<groupsize * grainsize>(exec),
-                thrust::detail::make_join_iterator(first1, n1, first2),
-                n1 + n2,
-                stage);
-
-  // inplace merge in the stage
-  bulk_::inplace_merge(bulk_::bound<groupsize * grainsize>(exec),
-                       stage, stage + n1, stage + n1 + n2,
-                       comp);
-  
-  // copy to the result
-  // XXX this might be slightly faster with a bounded copy_n
-  return bulk_::copy_n(exec, stage, n1 + n2, result);
-} // end staged_merge()
-
-
-struct merge_kernel
-{
-  template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename RandomAccessIterator3, typename RandomAccessIterator4, typename Compare>
-  __device__
-  void operator()(bulk_::concurrent_group<bulk_::agent<grainsize>,groupsize> &g,
-                  RandomAccessIterator1 first1, Size n1,
-                  RandomAccessIterator2 first2, Size n2,
-                  RandomAccessIterator3 merge_paths_first,
-                  RandomAccessIterator4 result,
-                  Compare comp)
-  {
-    typedef int size_type;
-
-    size_type elements_per_group = g.size() * g.this_exec.grainsize();
-
-    // determine the ranges to merge
-    size_type mp0  = merge_paths_first[g.index()];
-    size_type mp1  = merge_paths_first[g.index()+1];
-    size_type diag = elements_per_group * g.index();
-
-    size_type local_size1 = mp1 - mp0;
-    size_type local_size2 = thrust::min<size_type>(n1 + n2, diag + elements_per_group) - mp1 - diag + mp0;
-
-    first1 += mp0;
-    first2 += diag - mp0;
-    result += elements_per_group * g.index();
-
-    // XXX this assumes that RandomAccessIterator2's value_type converts to RandomAccessIterator1's value_type
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-#if __CUDA_ARCH__ >= 200
-    // merge through a stage
-    value_type *stage = reinterpret_cast<value_type*>(bulk_::malloc(g, elements_per_group * sizeof(value_type)));
-
-    if(bulk_::is_on_chip(stage))
-    {
-      staged_merge(g,
-                   first1, local_size1,
-                   first2, local_size2,
-                   bulk_::on_chip_cast(stage),
-                   result,
-                   comp);
-    } // end if
-    else
-    {
-      staged_merge(g,
-                   first1, local_size1,
-                   first2, local_size2,
-                   stage,
-                   result,
-                   comp);
-    } // end else
-
-    bulk_::free(g, stage);
-#else
-    __shared__ bulk_::uninitialized_array<value_type, groupsize * grainsize> stage;
-    staged_merge(g, first1, local_size1, first2, local_size2, stage.data(), result, comp);
-#endif
-  } // end operator()
-}; // end merge_kernel
-
-
-template<typename Size, typename RandomAccessIterator1,typename RandomAccessIterator2, typename Compare>
-struct locate_merge_path
-{
-  Size partition_size;
-  RandomAccessIterator1 first1, last1;
-  RandomAccessIterator2 first2, last2;
-  Compare comp;
-
-  __host__ __device__
-  locate_merge_path(Size partition_size, RandomAccessIterator1 first1, RandomAccessIterator1 last1, RandomAccessIterator2 first2, RandomAccessIterator2 last2, Compare comp)
-    : partition_size(partition_size),
-      first1(first1), last1(last1),
-      first2(first2), last2(last2),
-      comp(comp)
-  {}
-
-  template<typename Index>
-  __device__
-  Size operator()(Index i)
-  {
-    Size n1 = last1 - first1;
-    Size n2 = last2 - first2;
-    Size diag = thrust::min<Size>(partition_size * i, n1 + n2);
-    return bulk_::merge_path(first1, n1, first2, n2, diag, comp);
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-  typedef typename thrust::iterator_difference<RandomAccessIterator1>::type difference_type;
-  typedef int size_type;
-
-  // determined through empirical testing on K20c
-  const size_type groupsize = (sizeof(value_type) == sizeof(int)) ? 256 : 256 + 32;
-  const size_type grainsize = (sizeof(value_type) == sizeof(int)) ? 9   : 5;
-  
-  const size_type tile_size = groupsize * grainsize;
-
-  difference_type n = (last1 - first1) + (last2 - first2);
-  difference_type num_groups = (n + tile_size - 1) / tile_size;
-
-  thrust::detail::temporary_array<size_type,DerivedPolicy> merge_paths(exec, num_groups + 1);
-
-  thrust::tabulate(exec, merge_paths.begin(), merge_paths.end(), merge_detail::locate_merge_path<size_type,RandomAccessIterator1,RandomAccessIterator2,Compare>(tile_size,first1,last1,first2,last2,comp));
-
-  // merge partitions
-  size_type heap_size = tile_size * sizeof(value_type);
-  bulk_::concurrent_group<bulk_::agent<grainsize>,groupsize> g(heap_size);
-  bulk_::async(bulk_::par(stream(thrust::detail::derived_cast(exec)), g, num_groups), merge_detail::merge_kernel(), bulk_::root.this_exec, first1, last1 - first1, first2, last2 - first2, merge_paths.begin(), result, comp);
-
-  return result + n;
-} // end merge()
-
-
-} // end merge_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 merge(execution_policy<DerivedPolicy> &exec,
-                            RandomAccessIterator1 first1,
-                            RandomAccessIterator1 last1,
-                            RandomAccessIterator2 first2,
-                            RandomAccessIterator2 last2,
-                            RandomAccessIterator3 result,
-                            Compare comp)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator1, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator3 parallel_path(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-    {
-      return thrust::system::cuda::detail::merge_detail::merge(exec, first1, last1, first2, last2, result, comp);
-    }
-
-    __host__ __device__
-    static RandomAccessIterator3 sequential_path(execution_policy<DerivedPolicy> &,
-                                                 RandomAccessIterator1 first1,
-                                                 RandomAccessIterator1 last1,
-                                                 RandomAccessIterator2 first2,
-                                                 RandomAccessIterator2 last2,
-                                                 RandomAccessIterator3 result,
-                                                 Compare comp)
-    {
-      return thrust::merge(thrust::seq, first1, last1, first2, last2, result, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first1, last1, first2, last2, result, comp);
-#else
-  return workaround::sequential_path(exec, first1, last1, first2, last2, result, comp);
-#endif
-} // end merge()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/mismatch.h b/thrust/system/cuda/detail/mismatch.h
index c6ae90664..b1e2f44d2 100644
--- a/thrust/system/cuda/detail/mismatch.h
+++ b/thrust/system/cuda/detail/mismatch.h
@@ -1,22 +1,116 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/pair.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2,
+         BinaryPred                 binary_pred);
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2);
+} // namespace cuda_
+THRUST_NAMESPACE_END
+
+#include <thrust/system/cuda/detail/find.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class BinaryPred>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2,
+         BinaryPred                 binary_pred)
+{
+  typedef transform_pair_of_input_iterators_t<bool,
+                                              InputIt1,
+                                              InputIt2,
+                                              BinaryPred>
+      transform_t;
+
+  transform_t transform_first = transform_t(first1, first2, binary_pred);
+
+  transform_t result = cuda_cub::find_if_not(policy,
+                                          transform_first,
+                                          transform_first + thrust::distance(first1, last1),
+                                          identity());
+
+  return thrust::make_pair(first1 + thrust::distance(transform_first,result),
+                           first2 + thrust::distance(transform_first,result));
+}
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2>
+pair<InputIt1, InputIt2> __host__ __device__
+mismatch(execution_policy<Derived>& policy,
+         InputIt1                   first1,
+         InputIt1                   last1,
+         InputIt2                   first2)
+{
+  typedef typename thrust::iterator_value<InputIt1>::type InputType1;
+  return cuda_cub::mismatch(policy,
+                         first1,
+                         last1,
+                         first2,
+                         equal_to<InputType1>());
+}
+
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/par.h b/thrust/system/cuda/detail/par.h
index 57bc014bf..42c701ca7 100644
--- a/thrust/system/cuda/detail/par.h
+++ b/thrust/system/cuda/detail/par.h
@@ -1,82 +1,258 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016-2018, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/detail/execute_with_allocator.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-#include <thrust/detail/allocator/allocator_traits.h>
+#include <thrust/system/cuda/detail/util.h>
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#include <thrust/detail/allocator_aware_execution_policy.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+#  include <thrust/detail/dependencies_aware_execution_policy.h>
+#endif
 
 
-struct par_t : thrust::system::cuda::detail::execution_policy<par_t>
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived>
+struct execute_on_stream_base : execution_policy<Derived>
 {
-  par_t() : thrust::system::cuda::detail::execution_policy<par_t>() {}
+private:
+  cudaStream_t stream;
 
-  template<typename Allocator>
+public:
+  __thrust_exec_check_disable__
   __host__ __device__
-  typename thrust::detail::enable_if<
-    thrust::detail::is_allocator<Allocator>::value,
-    thrust::detail::execute_with_allocator<Allocator, execute_on_stream_base>
-  >::type
-    operator()(Allocator &alloc) const
+  execute_on_stream_base(cudaStream_t stream_ = default_stream())
+      : stream(stream_){}
+
+  THRUST_RUNTIME_FUNCTION
+  Derived
+  on(cudaStream_t const &s) const
+  {
+    Derived result = derived_cast(*this);
+    result.stream  = s;
+    return result;
+  }
+
+private:
+  friend __host__ __device__
+  cudaStream_t
+  get_stream(const execute_on_stream_base &exec)
   {
-    return thrust::detail::execute_with_allocator<Allocator, execute_on_stream_base>(alloc);
+    return exec.stream;
   }
+};
+
+template <class Derived>
+struct execute_on_stream_nosync_base : execution_policy<Derived>
+{
+private:
+  cudaStream_t stream;
 
+public:
   __host__ __device__
-  inline execute_on_stream on(const cudaStream_t &stream) const
+  execute_on_stream_nosync_base(cudaStream_t stream_ = default_stream())
+      : stream(stream_){}
+
+  THRUST_RUNTIME_FUNCTION
+  Derived
+  on(cudaStream_t const &s) const
   {
-    return execute_on_stream(stream);
+    Derived result = derived_cast(*this);
+    result.stream  = s;
+    return result;
+  }
+
+private:
+  friend __host__ __device__
+  cudaStream_t
+  get_stream(const execute_on_stream_nosync_base &exec)
+  {
+    return exec.stream;
+  }
+
+  friend __host__ __device__
+  bool
+  must_perform_optional_stream_synchronization(const execute_on_stream_nosync_base &)
+  {
+    return false;
   }
 };
 
+struct execute_on_stream : execute_on_stream_base<execute_on_stream>
+{
+  typedef execute_on_stream_base<execute_on_stream> base_t;
+
+  __host__ __device__
+  execute_on_stream() : base_t(){};
+  __host__ __device__
+  execute_on_stream(cudaStream_t stream) 
+  : base_t(stream){};
+};
+
+struct execute_on_stream_nosync : execute_on_stream_nosync_base<execute_on_stream_nosync>
+{
+  typedef execute_on_stream_nosync_base<execute_on_stream_nosync> base_t;
 
-} // end detail
+  __host__ __device__
+  execute_on_stream_nosync() : base_t(){};
+  __host__ __device__
+  execute_on_stream_nosync(cudaStream_t stream) 
+  : base_t(stream){};
+};
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ detail::par_t par;
-#else
-static const detail::par_t par;
+struct par_t : execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    execute_on_stream_base>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<
+    execute_on_stream_base>
 #endif
+{
+  typedef execution_policy<par_t> base_t;
 
+  __host__ __device__
+  constexpr par_t() : base_t() {}
 
-} // end cuda
-} // end system
+  typedef execute_on_stream stream_attachment_type;
 
+  THRUST_RUNTIME_FUNCTION
+  stream_attachment_type
+  on(cudaStream_t const &stream) const
+  {
+    return execute_on_stream(stream);
+  }
+};
 
-// alias par here
-namespace cuda
+struct par_nosync_t : execution_policy<par_nosync_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    execute_on_stream_nosync_base>
+#if THRUST_CPP_DIALECT >= 2011
+, thrust::detail::dependencies_aware_execution_policy<
+    execute_on_stream_nosync_base>
+#endif
 {
+  typedef execution_policy<par_nosync_t> base_t;
+
+  __host__ __device__
+  constexpr par_nosync_t() : base_t() {}
+
+  typedef execute_on_stream_nosync stream_attachment_type;
+
+  THRUST_RUNTIME_FUNCTION
+  stream_attachment_type
+  on(cudaStream_t const &stream) const
+  {
+    return execute_on_stream_nosync(stream);
+  }
+
+private:
+  //this function is defined to allow non-blocking calls on the default_stream() with thrust::cuda::par_nosync
+  //without explicitly using thrust::cuda::par_nosync.on(default_stream())
+  friend __host__ __device__
+  bool
+  must_perform_optional_stream_synchronization(const par_nosync_t &)
+  {
+    return false;
+  }
+};
+
+THRUST_INLINE_CONSTANT par_t par;
 
+/*! \p thrust::cuda::par_nosync is a parallel execution policy targeting Thrust's CUDA device backend.
+ *  Similar to \p thrust::cuda::par it allows execution of Thrust algorithms in a specific CUDA stream.
+ *
+ *  \p thrust::cuda::par_nosync indicates that an algorithm is free to avoid any synchronization of the 
+ *  associated stream that is not strictly required for correctness. Additionally, algorithms may return
+ *  before the corresponding kernels are completed, similar to asynchronous kernel launches via <<< >>> syntax.
+ *  The user must take care to perform explicit synchronization if necessary.
+ *  
+ *  The following code snippet demonstrates how to use \p thrust::cuda::par_nosync :
+ *
+ *  \code
+ *    #include <thrust/device_vector.h>
+ *    #include <thrust/for_each.h>
+ *    #include <thrust/execution_policy.h>
+ *
+ *    struct IncFunctor{
+ *        __host__ __device__
+ *        void operator()(std::size_t& x){ x = x + 1; };
+ *    };
+ *
+ *    int main(){
+ *        std::size_t N = 1000000;
+ *        thrust::device_vector<std::size_t> d_vec(N);
+ *
+ *        cudaStream_t stream;
+ *        cudaStreamCreate(&stream);
+ *        auto nosync_policy = thrust::cuda::par_nosync.on(stream);
+ *
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *        thrust::for_each(nosync_policy, d_vec.begin(), d_vec.end(), IncFunctor{});
+ *
+ *        //for_each may return before completion. Could do other cpu work in the meantime
+ *        // ...
+ *
+ *        //Wait for the completion of all for_each kernels
+ *        cudaStreamSynchronize(stream);
+ *
+ *        std::size_t x = thrust::reduce(nosync_policy, d_vec.begin(), d_vec.end());
+ *        //Currently, this synchronization is not necessary. reduce will still perform
+ *        //implicit synchronization to transfer the reduced value to the host to return it.
+ *        cudaStreamSynchronize(stream);
+ *        cudaStreamDestroy(stream);
+ *    }
+ *  \endcode
+ *
+ */
+THRUST_INLINE_CONSTANT par_nosync_t par_nosync;
+}    // namespace cuda_
 
-using thrust::system::cuda::par;
+namespace system {
+namespace cuda {
+  using thrust::cuda_cub::par;
+  using thrust::cuda_cub::par_nosync;
+  namespace detail {
+    using thrust::cuda_cub::par_t;
+    using thrust::cuda_cub::par_nosync_t;
+  }
+} // namesapce cuda
+} // namespace system
 
+namespace cuda {
+using thrust::cuda_cub::par;
+using thrust::cuda_cub::par_nosync;
+} // namespace cuda
 
-} // end cuda
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/par_to_seq.h b/thrust/system/cuda/detail/par_to_seq.h
new file mode 100644
index 000000000..e710f017b
--- /dev/null
+++ b/thrust/system/cuda/detail/par_to_seq.h
@@ -0,0 +1,86 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/seq.h>
+#include <thrust/system/cuda/detail/par.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <int PAR>
+struct has_par : thrust::detail::true_type {};
+
+template <>
+struct has_par<0> : thrust::detail::false_type {};
+
+template<class Policy>
+struct cvt_to_seq_impl
+{
+  typedef thrust::detail::seq_t seq_t;
+
+  static seq_t __host__ __device__
+  doit(Policy&)
+  {
+    return seq_t();
+  }
+};    // cvt_to_seq_impl
+
+#if 0
+template <class Allocator>
+struct cvt_to_seq_impl<
+    thrust::detail::execute_with_allocator<Allocator,
+                                           execute_on_stream_base> >
+{
+  typedef thrust::detail::execute_with_allocator<Allocator,
+                                                 execute_on_stream_base>
+      Policy;
+  typedef thrust::detail::execute_with_allocator<
+      Allocator,
+      thrust::system::detail::sequential::execution_policy>
+      seq_t;
+
+
+  static seq_t __host__ __device__
+  doit(Policy& policy)
+  {
+    return seq_t(policy.m_alloc);
+  }
+};    // specialization of struct cvt_to_seq_impl
+#endif
+
+template <class Policy>
+typename cvt_to_seq_impl<Policy>::seq_t __host__ __device__
+cvt_to_seq(Policy& policy)
+{
+  return cvt_to_seq_impl<Policy>::doit(policy);
+}
+
+} // namespace cuda_
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/parallel_for.h b/thrust/system/cuda/detail/parallel_for.h
new file mode 100644
index 000000000..43c3297aa
--- /dev/null
+++ b/thrust/system/cuda/detail/parallel_for.h
@@ -0,0 +1,179 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+namespace __parallel_for {
+
+  template <int _BLOCK_THREADS,
+            int _ITEMS_PER_THREAD = 1>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD,
+    };
+  };    // struct PtxPolicy
+
+  template <class Arch, class F>
+  struct Tuning;
+
+  template <class F>
+  struct Tuning<sm30, F>
+  {
+    typedef PtxPolicy<256, 2> type;
+  };
+
+
+  template <class F,
+            class Size>
+  struct ParallelForAgent
+  {
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, F>::type
+    {
+      typedef Tuning<Arch, F> tuning;
+    };
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS
+    };
+
+    template <bool IS_FULL_TILE>
+    static void    THRUST_DEVICE_FUNCTION
+    consume_tile(F    f,
+                 Size tile_base,
+                 int  items_in_tile)
+    {
+#pragma unroll
+      for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+      {
+        Size idx = BLOCK_THREADS * ITEM + threadIdx.x;
+        if (IS_FULL_TILE || idx < items_in_tile)
+          f(tile_base + idx);
+      }
+    }
+
+    THRUST_AGENT_ENTRY(F     f,
+                       Size  num_items,
+                       char * /*shmem*/ )
+    {
+      Size tile_base     = static_cast<Size>(blockIdx.x) * ITEMS_PER_TILE;
+      Size num_remaining = num_items - tile_base;
+      Size items_in_tile = static_cast<Size>(
+          num_remaining < ITEMS_PER_TILE ? num_remaining : ITEMS_PER_TILE);
+
+      if (items_in_tile == ITEMS_PER_TILE)
+      {
+        // full tile
+        consume_tile<true>(f, tile_base, ITEMS_PER_TILE);
+      }
+      else
+      {
+        // partial tile
+        consume_tile<false>(f, tile_base, items_in_tile);
+      }
+    }
+  };    // struct ParallelForEagent
+
+  template <class F,
+            class Size>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  parallel_for(Size         num_items,
+               F            f,
+               cudaStream_t stream)
+  {
+    if (num_items == 0)
+      return cudaSuccess;
+    using core::AgentLauncher;
+    using core::AgentPlan;
+
+    typedef AgentLauncher<ParallelForAgent<F, Size> > parallel_for_agent;
+    AgentPlan parallel_for_plan = parallel_for_agent::get_plan(stream);
+
+    parallel_for_agent pfa(parallel_for_plan, num_items, stream, "transform::agent");
+    pfa.launch(f, num_items);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return cudaSuccess;
+  }
+}    // __parallel_for
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class F,
+          class Size>
+void __host__ __device__
+parallel_for(execution_policy<Derived> &policy,
+             F                          f,
+             Size                       count)
+{
+  if (count == 0)
+  {
+    return;
+  }
+
+  // clang-format off
+  THRUST_CDP_DISPATCH(
+    (cudaStream_t stream = cuda_cub::stream(policy);
+     cudaError_t  status = __parallel_for::parallel_for(count, f, stream);
+     cuda_cub::throw_on_error(status, "parallel_for failed");
+     status = cuda_cub::synchronize_optional(policy);
+     cuda_cub::throw_on_error(status, "parallel_for: failed to synchronize");),
+    // CDP sequential impl:
+    (for (Size idx = 0; idx != count; ++idx)
+     {
+       f(idx);
+     }
+  ));
+  // clang-format on
+}
+
+}    // namespace cuda_cub
+
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/partition.h b/thrust/system/cuda/detail/partition.h
index c6ae90664..fad75eb0d 100644
--- a/thrust/system/cuda/detail/partition.h
+++ b/thrust/system/cuda/detail/partition.h
@@ -1,22 +1,1088 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
+#include <thrust/pair.h>
+#include <thrust/partition.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/agent/single_pass_scan_operators.cuh> // cub::ScanTileState
+#include <cub/block/block_scan.cuh>
+#include <cub/device/device_partition.cuh>
+#include <cub/util_device.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __partition {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class, class>
+  struct Tuning;
+
+  template<class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 10,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<350>
+
+  template<class T>
+  struct Tuning<sm30, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))),
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning<300>
+
+  template<int T>
+  struct __tag{};
+
+
+  struct no_stencil_tag_    {};
+  struct single_output_tag_
+  {
+    template<class T>
+    THRUST_DEVICE_FUNCTION T const& operator=(T const& t) const { return t; }
+  };
+
+  typedef no_stencil_tag_* no_stencil_tag;
+  typedef single_output_tag_* single_output_tag;;
+
+  template <class ItemsIt,
+            class StencilIt,
+            class SelectedOutIt,
+            class RejectedOutIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  struct PartitionAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type   item_type;
+    typedef typename iterator_traits<StencilIt>::value_type stencil_type;
+
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch,item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type   ItemsLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, StencilIt>::type StencilLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type   BlockLoadItems;
+      typedef typename core::BlockLoad<PtxPlan, StencilLoadIt>::type BlockLoadStencil;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        } scan_storage;
+
+        typename BlockLoadItems::TempStorage   load_items;
+        typename BlockLoadStencil::TempStorage load_stencil;
+
+        core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE> raw_exchange;
+      };    // union TempStorage
+    };    // struct PtxPlan
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt        ItemsLoadIt;
+    typedef typename ptx_plan::StencilLoadIt      StencilLoadIt;
+    typedef typename ptx_plan::BlockLoadItems     BlockLoadItems;
+    typedef typename ptx_plan::BlockLoadStencil   BlockLoadStencil;
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan          BlockScan;
+    typedef typename ptx_plan::TempStorage        TempStorage;
+
+    enum
+    {
+      SINGLE_OUTPUT    = thrust::detail::is_same<RejectedOutIt, single_output_tag>::value,
+      USE_STENCIL      = !thrust::detail::is_same<StencilIt, no_stencil_tag>::value,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  temp_storage;
+      ScanTileState &tile_state;
+      ItemsLoadIt    items_glob;
+      StencilLoadIt  stencil_glob;
+      SelectedOutIt  selected_out_glob;
+      RejectedOutIt  rejected_out_glob;
+      Predicate      predicate;
+      Size           num_items;
+
+      //---------------------------------------------------------------------
+      // Utilities
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  num_tile_items,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size num_rejected_prefix,
+              Size /*num_selections*/)
+      {
+        int tile_num_rejections = num_tile_items - num_tile_selections;
+
+        // Scatter items to shared memory (rejections first)
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int item_idx             = (threadIdx.x * ITEMS_PER_THREAD) + ITEM;
+          int local_selection_idx  = selection_indices[ITEM] - num_selections_prefix;
+          int local_rejection_idx  = item_idx - local_selection_idx;
+          int local_scatter_offset = (selection_flags[ITEM])
+                                         ? tile_num_rejections + local_selection_idx
+                                         : local_rejection_idx;
+
+          temp_storage.raw_exchange[local_scatter_offset] = items[ITEM];
+        }
+
+        core::sync_threadblock();
+
+        // Gather items from shared memory and scatter to global
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int  item_idx       = (ITEM * BLOCK_THREADS) + threadIdx.x;
+          int  rejection_idx  = item_idx;
+          int  selection_idx  = item_idx - tile_num_rejections;
+          Size scatter_offset = (item_idx < tile_num_rejections)
+                                    ? num_items -
+                                          num_rejected_prefix - rejection_idx - 1
+                                    : num_selections_prefix + selection_idx;
+
+          item_type item = temp_storage.raw_exchange[item_idx];
+
+          if (!IS_LAST_TILE || (item_idx < num_tile_items))
+          {
+            if (SINGLE_OUTPUT || item_idx >= tile_num_rejections)
+            {
+              selected_out_glob[scatter_offset] = item;
+            }
+            else    // if !SINGLE_OUTPUT, scatter rejected items separately
+            {
+              rejected_out_glob[num_items - scatter_offset - 1] = item;
+            }
+          }
+        }
+      }    // func scatter
+
+      //------------------------------------------
+      // specialize predicate on different types
+      //------------------------------------------
+
+      enum ItemStencil
+      {
+        ITEM,
+        STENCIL
+      };
+
+      template <bool TAG, class T>
+      struct wrap_value
+      {
+        T const &              x;
+        THRUST_DEVICE_FUNCTION wrap_value(T const &x) : x(x) {}
+
+        THRUST_DEVICE_FUNCTION T const &operator()() const { return x; };
+      };    // struct wrap_type
+
+      //------- item
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &x,
+                        __tag<false /* USE_STENCIL */>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<ITEM, item_type> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+      //-------- stencil
+
+      template <class T>
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, T> const &x,
+                        __tag<true>)
+      {
+        return predicate(x());
+      }
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, no_stencil_tag_> const &,
+                        __tag<true>)
+      {
+        return false;
+      }
+
+
+      THRUST_DEVICE_FUNCTION bool
+      predicate_wrapper(wrap_value<STENCIL, stencil_type> const &,
+                        __tag<false>)
+      {
+        return false;
+      }
+
+      template <bool IS_LAST_TILE, ItemStencil TYPE, class T>
+      THRUST_DEVICE_FUNCTION void
+      compute_selection_flags(int num_tile_items,
+                              T (&values)[ITEMS_PER_THREAD],
+                              Size (&selection_flags)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Out-of-bounds items are selection_flags
+          selection_flags[ITEM] = 1;
+
+          if (!IS_LAST_TILE ||
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items))
+          {
+            selection_flags[ITEM] =
+                predicate_wrapper(wrap_value<TYPE, T>(values[ITEM]),
+                                  __tag<USE_STENCIL>());
+          }
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_glob + tile_base, items_loc, num_tile_items);
+        }
+        else
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_glob + tile_base, items_loc);
+        }
+
+        core::sync_threadblock();
+
+        if (USE_STENCIL)
+        {
+          stencil_type stencil_loc[ITEMS_PER_THREAD];
+
+          if (IS_LAST_TILE)
+          {
+            BlockLoadStencil(temp_storage.load_stencil)
+                .Load(stencil_glob + tile_base, stencil_loc, num_tile_items);
+          }
+          else
+          {
+            BlockLoadStencil(temp_storage.load_stencil)
+                .Load(stencil_glob + tile_base, stencil_loc);
+          }
+
+          compute_selection_flags<IS_LAST_TILE, STENCIL>(num_tile_items,
+                                                         stencil_loc,
+                                                         selection_flags);
+        }
+        else /* Use predicate on items rather then stencil */
+        {
+          compute_selection_flags<IS_LAST_TILE, ITEM>(num_tile_items,
+                                                      items_loc,
+                                                      selection_flags);
+        }
+
+        core::sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        Size num_rejected_prefix   = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.scan_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+          num_rejected_prefix   = tile_base - num_selections_prefix;
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        core::sync_threadblock();
+
+        scatter<IS_LAST_TILE>(items_loc,
+                              selection_flags,
+                              selection_idx,
+                              num_tile_items,
+                              num_tile_selections,
+                              num_selections_prefix,
+                              num_rejected_prefix,
+                              num_selections);
+
+
+        return num_selections;
+      }
+
+
+      template <bool         IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION Size
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           ItemsLoadIt      items_glob_,
+           StencilLoadIt    stencil_glob_,
+           SelectedOutIt    selected_out_glob_,
+           RejectedOutIt    rejected_out_glob_,
+           Predicate        predicate_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            items_glob(items_glob_),
+            stencil_glob(stencil_glob_),
+            selected_out_glob(selected_out_glob_),
+            rejected_out_glob(rejected_out_glob_),
+            predicate(predicate_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }    //
+    };     //struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt          items,
+                       StencilIt        stencil,
+                       SelectedOutIt    selected_out,
+                       RejectedOutIt    rejected_out,
+                       Predicate        predicate,
+                       Size             num_items,
+                       NumSelectedOutIt num_selected_out,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), items),
+           core::make_load_iterator(ptx_plan(), stencil),
+           selected_out,
+           rejected_out,
+           predicate,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };       // struct PartitionAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+  template <class ItemsIt,
+            class StencilIt,
+            class SelectedOutIt,
+            class RejectedOutIt,
+            class Predicate,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            ItemsIt          items,
+            StencilIt        stencil,
+            SelectedOutIt    selected_out,
+            RejectedOutIt    rejected_out,
+            Predicate        predicate,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        PartitionAgent<ItemsIt,
+                       StencilIt,
+                       SelectedOutIt,
+                       RejectedOutIt,
+                       Predicate,
+                       Size,
+                       NumSelectedOutIt> >
+        partition_agent;
+
+    typedef typename partition_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type      init_plan      = init_agent::get_plan();
+    typename get_plan<partition_agent>::type partition_plan = partition_agent::get_plan(stream);
+
+    int tile_size = partition_plan.items_per_tile;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+    size_t vshmem_storage = core::vshmem_size(partition_plan.shared_memory_size,
+                                              num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return status;
+
+    size_t allocation_sizes[2] = {0, vshmem_storage};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+
+    void* allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                                temp_storage_bytes,
+                                                allocations,
+                                                allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status = tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "partition::init_agent");
+
+    char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[1] : NULL;
+
+    partition_agent pa(partition_plan, num_items, stream, vshmem_ptr, "partition::partition_agent");
+
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    pa.launch(items,
+              stencil,
+              selected_out,
+              rejected_out,
+              predicate,
+              num_items,
+              num_selected_out,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+
+  }
+
+  template <typename Derived,
+            typename InputIt,
+            typename StencilIt,
+            typename SelectedOutIt,
+            typename RejectedOutIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  pair<SelectedOutIt, RejectedOutIt>
+  partition(execution_policy<Derived>& policy,
+            InputIt                    first,
+            InputIt                    last,
+            StencilIt                  stencil,
+            SelectedOutIt              selected_result,
+            RejectedOutIt              rejected_result,
+            Predicate                  predicate)
+  {
+    typedef typename iterator_traits<InputIt>::difference_type size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(first, last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       selected_result,
+                       rejected_result,
+                       predicate,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "partition failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "partition failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "partition failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       stencil,
+                       selected_result,
+                       rejected_result,
+                       predicate,
+                       d_num_selected_out,
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "partition failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "partition failed to synchronize");
+
+    size_type num_selected = 0;
+    if (num_items > 0)
+    {
+      num_selected = get_value(policy, d_num_selected_out);
+    }
+
+    return thrust::make_pair(selected_result + num_selected,
+                             rejected_result + num_items - num_selected);
+  }
+
+  template <typename Derived,
+            typename Iterator,
+            typename StencilIt,
+            typename Predicate>
+  THRUST_RUNTIME_FUNCTION
+  Iterator partition_inplace(execution_policy<Derived>& policy,
+                             Iterator                   first,
+                             Iterator                   last,
+                             StencilIt                  stencil,
+                             Predicate                  predicate)
+  {
+    typedef typename iterator_traits<Iterator>::difference_type size_type;
+    typedef typename iterator_traits<Iterator>::value_type      value_type;
+
+    size_type num_items = thrust::distance(first, last);
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<value_type, Derived> tmp(policy, num_items);
+
+    cuda_cub::uninitialized_copy(policy, first, last, tmp.begin());
+
+    pair<Iterator, single_output_tag> result =
+        partition(policy,
+                  tmp.data().get(),
+                  tmp.data().get() + num_items,
+                  stencil,
+                  first,
+                  single_output_tag(),
+                  predicate);
+
+    size_type num_selected = result.first - first;
+
+    return first + num_selected;
+  }
+}    // namespace __partition
+
+///// copy
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+partition_copy(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               StencilIt                  stencil,
+               SelectedOutIt              selected_result,
+               RejectedOutIt              rejected_result,
+               Predicate                  predicate)
+{
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);));
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+partition_copy(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               SelectedOutIt              selected_result,
+               RejectedOutIt              rejected_result,
+               Predicate                  predicate)
+{
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  __partition::no_stencil_tag(),
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::partition_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);));
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+stable_partition_copy(execution_policy<Derived> &policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      SelectedOutIt              selected_result,
+                      RejectedOutIt              rejected_result,
+                      Predicate                  predicate)
+{
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  __partition::no_stencil_tag(),
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         selected_result,
+                                         rejected_result,
+                                         predicate);));
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class SelectedOutIt,
+          class RejectedOutIt,
+          class Predicate>
+pair<SelectedOutIt, RejectedOutIt> __host__ __device__
+stable_partition_copy(execution_policy<Derived> &policy,
+                      InputIt                    first,
+                      InputIt                    last,
+                      StencilIt                  stencil,
+                      SelectedOutIt              selected_result,
+                      RejectedOutIt              rejected_result,
+                      Predicate                  predicate)
+{
+  auto ret = thrust::make_pair(selected_result, rejected_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition(policy,
+                                  first,
+                                  last,
+                                  stencil,
+                                  selected_result,
+                                  rejected_result,
+                                  predicate);),
+    (ret = thrust::stable_partition_copy(cvt_to_seq(derived_cast(policy)),
+                                         first,
+                                         last,
+                                         stencil,
+                                         selected_result,
+                                         rejected_result,
+                                         predicate);));
+  return ret;
+}
+
+/// inplace
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate>
+Iterator __host__ __device__
+partition(execution_policy<Derived> &policy,
+          Iterator                   first,
+          Iterator                   last,
+          StencilIt                  stencil,
+          Predicate                  predicate)
+{
+  THRUST_CDP_DISPATCH(
+    (last =
+       __partition::partition_inplace(policy, first, last, stencil, predicate);),
+    (last = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              stencil,
+                              predicate);));
+  return last;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class Predicate>
+Iterator __host__ __device__
+partition(execution_policy<Derived> &policy,
+          Iterator                   first,
+          Iterator                   last,
+          Predicate                  predicate)
+{
+  THRUST_CDP_DISPATCH(
+    (last = __partition::partition_inplace(policy,
+                                           first,
+                                           last,
+                                           __partition::no_stencil_tag(),
+                                           predicate);),
+    (last = thrust::partition(cvt_to_seq(derived_cast(policy)),
+                              first,
+                              last,
+                              predicate);));
+  return last;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate>
+Iterator __host__ __device__
+stable_partition(execution_policy<Derived> &policy,
+                 Iterator                   first,
+                 Iterator                   last,
+                 StencilIt                  stencil,
+                 Predicate                  predicate)
+{
+  auto ret = last;
+  THRUST_CDP_DISPATCH(
+    (ret =
+       __partition::partition_inplace(policy, first, last, stencil, predicate);
+
+     /* partition returns rejected values in reverse order
+       so reverse the rejected elements to make it stable */
+     cuda_cub::reverse(policy, ret, last);),
+    (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
+                                    first,
+                                    last,
+                                    stencil,
+                                    predicate);));
+  return ret;
+}
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class Iterator,
+          class Predicate>
+Iterator __host__ __device__
+stable_partition(execution_policy<Derived> &policy,
+                 Iterator                   first,
+                 Iterator                   last,
+                 Predicate                  predicate)
+{
+  auto ret = last;
+  THRUST_CDP_DISPATCH(
+    (ret = __partition::partition_inplace(policy,
+                                          first,
+                                          last,
+                                          __partition::no_stencil_tag(),
+                                          predicate);
+
+     /* partition returns rejected values in reverse order
+      so reverse the rejected elements to make it stable */
+     cuda_cub::reverse(policy, ret, last);),
+    (ret = thrust::stable_partition(cvt_to_seq(derived_cast(policy)),
+                                    first,
+                                    last,
+                                    predicate);));
+  return ret;
+}
+
+template <class Derived,
+          class ItemsIt,
+          class Predicate>
+bool __host__ __device__
+is_partitioned(execution_policy<Derived> &policy,
+               ItemsIt                    first,
+               ItemsIt                    last,
+               Predicate                  predicate)
+{
+  ItemsIt boundary = cuda_cub::find_if_not(policy, first, last, predicate);
+  ItemsIt end      = cuda_cub::find_if(policy,boundary,last,predicate);
+  return end == last;
+}
+
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/per_device_resource.h b/thrust/system/cuda/detail/per_device_resource.h
new file mode 100644
index 000000000..414ea7788
--- /dev/null
+++ b/thrust/system/cuda/detail/per_device_resource.h
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <mutex>
+#include <unordered_map>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub
+{
+
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(execution_policy<DerivedPolicy>&)
+{
+    static std::mutex map_lock;
+    static std::unordered_map<int, MR> device_id_to_resource;
+
+    int device_id;
+    thrust::cuda_cub::throw_on_error(cudaGetDevice(&device_id));
+
+    std::lock_guard<std::mutex> lock{map_lock};
+    return &device_id_to_resource[device_id];
+}
+
+}
+
+THRUST_NAMESPACE_END
+
+#endif
+
+#endif
+
diff --git a/thrust/system/cuda/detail/reduce.h b/thrust/system/cuda/detail/reduce.h
index 615b280a2..41d9075da 100644
--- a/thrust/system/cuda/detail/reduce.h
+++ b/thrust/system/cuda/detail/reduce.h
@@ -1,55 +1,1073 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
+#pragma once
 
+#include <thrust/detail/config.h>
 
-/*! \file reduce.h
- *  \brief Reduce a sequence of elements with a given length.
- */
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#pragma once
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/make_unsigned_special.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
+#include <cub/device/device_reduce.cuh>
+#include <cub/util_math.cuh>
 
-namespace thrust
-{
-namespace system
+THRUST_NAMESPACE_BEGIN
+
+// forward declare generic reduce
+// to circumvent circular dependency
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename T,
+          typename BinaryFunction>
+T __host__ __device__
+reduce(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+       InputIterator                                               first,
+       InputIterator                                               last,
+       T                                                           init,
+       BinaryFunction                                              binary_op);
+
+namespace cuda_cub {
+
+namespace __reduce {
+
+  template<bool>
+  struct is_true : thrust::detail::false_type {};
+  template<>
+  struct is_true<true> : thrust::detail::true_type {};
+
+  template <int                       _BLOCK_THREADS,
+            int                       _ITEMS_PER_THREAD   = 1,
+            int                       _VECTOR_LOAD_LENGTH = 1,
+            cub::BlockReduceAlgorithm _BLOCK_ALGORITHM    = cub::BLOCK_REDUCE_RAKING,
+            cub::CacheLoadModifier    _LOAD_MODIFIER      = cub::LOAD_DEFAULT,
+            cub::GridMappingStrategy  _GRID_MAPPING       = cub::GRID_MAPPING_DYNAMIC>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS      = _BLOCK_THREADS,
+      ITEMS_PER_THREAD   = _ITEMS_PER_THREAD,
+      VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH,
+      ITEMS_PER_TILE     = _BLOCK_THREADS * _ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;
+    static const cub::CacheLoadModifier    LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::GridMappingStrategy  GRID_MAPPING    = _GRID_MAPPING;
+  }; // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  template <class T>
+  struct Tuning<sm30, T>
+  {
+    enum
+    {
+      // Relative size of T type to a 4-byte word
+      SCALE_FACTOR_4B = (sizeof(T) + 3) / 4,
+      // Relative size of T type to a 1-byte word
+      SCALE_FACTOR_1B = sizeof(T),
+    };
+
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / SCALE_FACTOR_4B),
+                      2,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_DEFAULT,
+                      cub::GRID_MAPPING_RAKE>
+        type;
+  }; // Tuning sm30
+
+  template <class T>
+  struct Tuning<sm35, T> : Tuning<sm30,T>
+  {
+    // ReducePolicy1B (GTX Titan: 228.7 GB/s @ 192M 1B items)
+    typedef PtxPolicy<128,
+                      CUB_MAX(1, 24 / Tuning::SCALE_FACTOR_1B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
+        ReducePolicy1B;
+
+    // ReducePolicy4B types (GTX Titan: 255.1 GB/s @ 48M 4B items)
+    typedef PtxPolicy<256,
+                      CUB_MAX(1, 20 / Tuning::SCALE_FACTOR_4B),
+                      4,
+                      cub::BLOCK_REDUCE_WARP_REDUCTIONS,
+                      cub::LOAD_LDG,
+                      cub::GRID_MAPPING_DYNAMIC>
+        ReducePolicy4B;
+
+    typedef typename thrust::detail::conditional<(sizeof(T) < 4),
+                                                 ReducePolicy1B,
+                                                 ReducePolicy4B>::type type;
+  };    // Tuning sm35
+
+  template <class InputIt,
+            class OutputIt,
+            class T,
+            class Size,
+            class ReductionOp>
+  struct ReduceAgent
+  {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,T>::type
+    {
+      // we need this type definition to indicate "specialize_plan" metafunction
+      // that this PtxPlan may have specializations for different Arch
+      // via Tuning<Arch,T> type.
+      //
+      typedef Tuning<Arch,T> tuning;
+
+      typedef typename cub::CubVector<T, PtxPlan::VECTOR_LOAD_LENGTH> Vector;
+      typedef typename core::LoadIterator<PtxPlan, InputIt>::type     LoadIt;
+      typedef cub::BlockReduce<T,
+                               PtxPlan::BLOCK_THREADS,
+                               PtxPlan::BLOCK_ALGORITHM,
+                               1,
+                               1,
+                               Arch::ver>
+          BlockReduce;
+
+      typedef cub::CacheModifiedInputIterator<PtxPlan::LOAD_MODIFIER,
+                                              Vector,
+                                              Size>
+          VectorLoadIt;
+
+      struct TempStorage
+      {
+        typename BlockReduce::TempStorage reduce;
+        //
+        Size dequeue_offset;
+      };    // struct TempStorage
+
+
+    }; // struct PtxPlan
+
+    // Reduction need additional information which is not covered in
+    // default core::AgentPlan. We thus inherit from core::AgentPlan
+    // and add additional member fields that are needed.
+    // Other algorithms, e.g. merge, may not need additional information,
+    // and may use AgentPlan directly, instead of defining their own Plan type.
+    //
+    struct Plan : core::AgentPlan
+    {
+      cub::GridMappingStrategy grid_mapping;
+
+      THRUST_RUNTIME_FUNCTION
+      Plan() {}
+
+      template <class P>
+      THRUST_RUNTIME_FUNCTION
+          Plan(P) : core::AgentPlan(P()),
+                    grid_mapping(P::GRID_MAPPING)
+      {
+      }
+    };
+
+    // this specialized PtxPlan for a device-compiled Arch
+    // ptx_plan type *must* only be used from device code
+    // Its use from host code will result in *undefined behaviour*
+    //
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::TempStorage  TempStorage;
+    typedef typename ptx_plan::Vector       Vector;
+    typedef typename ptx_plan::LoadIt       LoadIt;
+    typedef typename ptx_plan::BlockReduce  BlockReduce;
+    typedef typename ptx_plan::VectorLoadIt VectorLoadIt;
+
+    enum
+    {
+      ITEMS_PER_THREAD   = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS      = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_TILE     = ptx_plan::ITEMS_PER_TILE,
+      VECTOR_LOAD_LENGTH = ptx_plan::VECTOR_LOAD_LENGTH,
+
+      ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) &&
+                              (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) &&
+                              thrust::detail::is_pointer<InputIt>::value &&
+                              thrust::detail::is_arithmetic<
+                                  typename thrust::detail::remove_cv<T> >::value
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per thread data
+      //---------------------------------------------------------------------
+
+      TempStorage &storage;
+      InputIt      input_it;
+      LoadIt       load_it;
+      ReductionOp  reduction_op;
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &storage_,
+                                  InputIt      input_it_,
+                                  ReductionOp  reduction_op_)
+          : storage(storage_),
+            input_it(input_it_),
+            load_it(core::make_load_iterator(ptx_plan(), input_it)),
+            reduction_op(reduction_op_) {}
+
+      //---------------------------------------------------------------------
+      // Utility
+      //---------------------------------------------------------------------
+
+
+      // Whether or not the input is aligned with the vector type
+      // (specialized for types we can vectorize)
+      //
+      template <class Iterator>
+      static THRUST_DEVICE_FUNCTION bool
+      is_aligned(Iterator d_in,
+                 thrust::detail::true_type /* can_vectorize */)
+      {
+        return (size_t(d_in) & (sizeof(Vector) - 1)) == 0;
+      }
+
+      // Whether or not the input is aligned with the vector type
+      // (specialized for types we cannot vectorize)
+      //
+      template <class Iterator>
+      static THRUST_DEVICE_FUNCTION bool
+      is_aligned(Iterator,
+                 thrust::detail::false_type /* can_vectorize */)
+      {
+        return false;
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      // Consume a full tile of input (non-vectorized)
+      //
+      template <int IS_FIRST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  /*valid_items*/,
+                   thrust::detail::true_type /* is_full_tile */,
+                   thrust::detail::false_type /* can_vectorize */)
+      {
+        T items[ITEMS_PER_THREAD];
+
+        // Load items in striped fashion
+        cub::LoadDirectStriped<BLOCK_THREADS>(threadIdx.x,
+                                              load_it + block_offset,
+                                              items);
+
+        // Reduce items within each thread stripe
+        thread_aggregate =
+            (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op)
+                            : cub::internal::ThreadReduce(items, reduction_op,
+                                                          thread_aggregate);
+      }
+
+      // Consume a full tile of input (vectorized)
+      //
+      template <int IS_FIRST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  /*valid_items*/,
+                   thrust::detail::true_type /* is_full_tile */,
+                   thrust::detail::true_type /* can_vectorize */)
+      {
+        // Alias items as an array of VectorT and load it in striped fashion
+        enum
+        {
+          WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH
+        };
+
+        T items[ITEMS_PER_THREAD];
+
+        Vector *vec_items = reinterpret_cast<Vector *>(items);
+
+        // Vector Input iterator wrapper type (for applying cache modifier)
+        T *d_in_unqualified = const_cast<T *>(input_it) +
+                              block_offset +
+                              (threadIdx.x * VECTOR_LOAD_LENGTH);
+        VectorLoadIt vec_load_it(reinterpret_cast<Vector *>(d_in_unqualified));
+
+#pragma unroll
+        for (int i = 0; i < WORDS; ++i)
+        {
+          vec_items[i] = vec_load_it[BLOCK_THREADS * i];
+        }
+
+
+        // Reduce items within each thread stripe
+        thread_aggregate =
+            (IS_FIRST_TILE) ? cub::internal::ThreadReduce(items, reduction_op)
+                            : cub::internal::ThreadReduce(items, reduction_op,
+                                                          thread_aggregate);
+      }
+
+
+      // Consume a partial tile of input
+      //
+      template <int IS_FIRST_TILE, class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(T &  thread_aggregate,
+                   Size block_offset,
+                   int  valid_items,
+                   thrust::detail::false_type /* is_full_tile */,
+                   CAN_VECTORIZE)
+      {
+        // Partial tile
+        int thread_offset = threadIdx.x;
+
+        // Read first item
+        if ((IS_FIRST_TILE) && (thread_offset < valid_items))
+        {
+          thread_aggregate = load_it[block_offset + thread_offset];
+          thread_offset += BLOCK_THREADS;
+        }
+
+        // Continue reading items (block-striped)
+        while (thread_offset < valid_items)
+        {
+          thread_aggregate = reduction_op(
+              thread_aggregate,
+              thrust::raw_reference_cast(load_it[block_offset + thread_offset]));
+          thread_offset += BLOCK_THREADS;
+        }
+      }
+
+      //---------------------------------------------------------------
+      // Consume a contiguous segment of tiles
+      //---------------------------------------------------------------------
+
+
+      // Reduce a contiguous segment of input tiles
+      //
+      template <class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION T
+      consume_range_impl(Size          block_offset,
+                         Size          block_end,
+                         CAN_VECTORIZE can_vectorize)
+      {
+        T thread_aggregate;
+
+        if (block_offset + ITEMS_PER_TILE > block_end)
+        {
+          // First tile isn't full (not all threads have valid items)
+          int valid_items = block_end - block_offset;
+          consume_tile<true>(thread_aggregate,
+                             block_offset,
+                             valid_items,
+                             thrust::detail::false_type(),
+                             can_vectorize);
+          return BlockReduce(storage.reduce)
+              .Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // At least one full block
+        consume_tile<true>(thread_aggregate,
+                           block_offset,
+                           ITEMS_PER_TILE,
+                           thrust::detail::true_type(),
+                           can_vectorize);
+        block_offset += ITEMS_PER_TILE;
+
+        // Consume subsequent full tiles of input
+        while (block_offset + ITEMS_PER_TILE <= block_end)
+        {
+          consume_tile<false>(thread_aggregate,
+                              block_offset,
+                              ITEMS_PER_TILE,
+                              thrust::detail::true_type(),
+                              can_vectorize);
+          block_offset += ITEMS_PER_TILE;
+        }
+
+        // Consume a partially-full tile
+        if (block_offset < block_end)
+        {
+          int valid_items = block_end - block_offset;
+          consume_tile<false>(thread_aggregate,
+                              block_offset,
+                              valid_items,
+                              thrust::detail::false_type(),
+                              can_vectorize);
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduce(storage.reduce)
+            .Reduce(thread_aggregate, reduction_op);
+      }
+
+      // Reduce a contiguous segment of input tiles
+      //
+      THRUST_DEVICE_FUNCTION T consume_range(Size block_offset,
+                                             Size block_end)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>          attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION>  path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        return is_aligned(input_it + block_offset, attempt_vec())
+                   ? consume_range_impl(block_offset, block_end, path_a())
+                   : consume_range_impl(block_offset, block_end, path_b());
+      }
+
+      // Reduce a contiguous segment of input tiles
+      //
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles(Size /*num_items*/,
+                    cub::GridEvenShare<Size> &even_share,
+                    cub::GridQueue<UnsignedSize> & /*queue*/,
+                    thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_RAKE> /*is_rake*/)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>          attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION>  path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        // Initialize even-share descriptor for this thread block
+        even_share
+            .template BlockInit<ITEMS_PER_TILE, cub::GRID_MAPPING_RAKE>();
+
+        return is_aligned(input_it, attempt_vec())
+                   ? consume_range_impl(even_share.block_offset,
+                                        even_share.block_end,
+                                        path_a())
+                   : consume_range_impl(even_share.block_offset,
+                                        even_share.block_end,
+                                        path_b());
+      }
+
+
+      //---------------------------------------------------------------------
+      // Dynamically consume tiles
+      //---------------------------------------------------------------------
+
+      // Dequeue and reduce tiles of items as part of a inter-block reduction
+      //
+      template <class CAN_VECTORIZE>
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles_impl(Size                         num_items,
+                         cub::GridQueue<UnsignedSize> queue,
+                         CAN_VECTORIZE                can_vectorize)
+      {
+        using core::sync_threadblock;
+
+        // We give each thread block at least one tile of input.
+        T    thread_aggregate;
+        Size block_offset    = blockIdx.x * ITEMS_PER_TILE;
+        Size even_share_base = gridDim.x * ITEMS_PER_TILE;
+
+        if (block_offset + ITEMS_PER_TILE > num_items)
+        {
+          // First tile isn't full (not all threads have valid items)
+          int valid_items = num_items - block_offset;
+          consume_tile<true>(thread_aggregate,
+                             block_offset,
+                             valid_items,
+                             thrust::detail::false_type(),
+                             can_vectorize);
+          return BlockReduce(storage.reduce)
+              .Reduce(thread_aggregate, reduction_op, valid_items);
+        }
+
+        // Consume first full tile of input
+        consume_tile<true>(thread_aggregate,
+                           block_offset,
+                           ITEMS_PER_TILE,
+                           thrust::detail::true_type(),
+                           can_vectorize);
+
+        if (num_items > even_share_base)
+        {
+          // Dequeue a tile of items
+          if (threadIdx.x == 0)
+            storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) +
+                                     even_share_base;
+
+          sync_threadblock();
+
+          // Grab tile offset and check if we're done with full tiles
+          block_offset = storage.dequeue_offset;
+
+          // Consume more full tiles
+          while (block_offset + ITEMS_PER_TILE <= num_items)
+          {
+            consume_tile<false>(thread_aggregate,
+                                block_offset,
+                                ITEMS_PER_TILE,
+                                thrust::detail::true_type(),
+                                can_vectorize);
+
+            sync_threadblock();
+
+            // Dequeue a tile of items
+            if (threadIdx.x == 0)
+              storage.dequeue_offset = queue.Drain(ITEMS_PER_TILE) +
+                                       even_share_base;
+
+            sync_threadblock();
+
+            // Grab tile offset and check if we're done with full tiles
+            block_offset = storage.dequeue_offset;
+          }
+
+          // Consume partial tile
+          if (block_offset < num_items)
+          {
+            int valid_items = num_items - block_offset;
+            consume_tile<false>(thread_aggregate,
+                                block_offset,
+                                valid_items,
+                                thrust::detail::false_type(),
+                                can_vectorize);
+          }
+        }
+
+        // Compute block-wide reduction (all threads have valid items)
+        return BlockReduce(storage.reduce)
+            .Reduce(thread_aggregate, reduction_op);
+      }
+
+
+      // Dequeue and reduce tiles of items as part of a inter-block reduction
+      //
+      THRUST_DEVICE_FUNCTION T
+      consume_tiles(
+          Size                              num_items,
+          cub::GridEvenShare<Size> &/*even_share*/,
+          cub::GridQueue<UnsignedSize> &    queue,
+          thrust::detail::integral_constant<cub::GridMappingStrategy, cub::GRID_MAPPING_DYNAMIC>)
+      {
+        typedef is_true<ATTEMPT_VECTORIZATION>         attempt_vec;
+        typedef is_true<true && ATTEMPT_VECTORIZATION> path_a;
+        typedef is_true<false && ATTEMPT_VECTORIZATION> path_b;
+
+        return is_aligned(input_it, attempt_vec())
+                   ? consume_tiles_impl(num_items, queue, path_a())
+                   : consume_tiles_impl(num_items, queue, path_b());
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry points
+    //---------------------------------------------------------------------
+
+    // single tile reduce entry point
+    //
+    THRUST_AGENT_ENTRY(InputIt     input_it,
+                       OutputIt    output_it,
+                       Size        num_items,
+                       ReductionOp reduction_op,
+                       char *      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      if (num_items == 0)
+      {
+        return;
+      }
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op).consume_range((Size)0, num_items);
+
+      if (threadIdx.x == 0)
+        *output_it = block_aggregate;
+    }
+
+    // single tile reduce entry point
+    //
+    THRUST_AGENT_ENTRY(InputIt     input_it,
+                       OutputIt    output_it,
+                       Size        num_items,
+                       ReductionOp reduction_op,
+                       T           init,
+                       char *      shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      if (num_items == 0)
+      {
+        if (threadIdx.x == 0)
+          *output_it = init;
+        return;
+      }
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op).consume_range((Size)0, num_items);
+
+      if (threadIdx.x == 0)
+        *output_it = reduction_op(init, block_aggregate);
+    }
+
+    THRUST_AGENT_ENTRY(InputIt                          input_it,
+                       OutputIt                         output_it,
+                       Size                             num_items,
+                       cub::GridEvenShare<Size> even_share,
+                       cub::GridQueue<UnsignedSize>     queue,
+                       ReductionOp                      reduction_op,
+                       char *                           shmem)
+    {
+      TempStorage& storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      typedef thrust::detail::integral_constant<cub::GridMappingStrategy, ptx_plan::GRID_MAPPING> grid_mapping;
+
+      T block_aggregate =
+          impl(storage, input_it, reduction_op)
+              .consume_tiles(num_items, even_share, queue, grid_mapping());
+
+      if (threadIdx.x == 0)
+        output_it[blockIdx.x] = block_aggregate;
+    }
+  };    // struct ReduceAgent
+
+  template<class Size>
+  struct DrainAgent
+  {
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<1> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(cub::GridQueue<UnsignedSize> grid_queue,
+                       Size                         num_items,
+                       char * /*shmem*/)
+    {
+      grid_queue.FillAndResetDrain(num_items);
+    }
+  };    // struct DrainAgent;
+
+
+  template <class InputIt,
+            class OutputIt,
+            class Size,
+            class ReductionOp,
+            class T>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *       d_temp_storage,
+            size_t &     temp_storage_bytes,
+            InputIt      input_it,
+            Size         num_items,
+            T            init,
+            ReductionOp  reduction_op,
+            OutputIt     output_it,
+            cudaStream_t stream)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+    using core::get_agent_plan;
+    using core::cuda_optional;
+
+    typedef typename detail::make_unsigned_special<Size>::type UnsignedSize;
+
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        ReduceAgent<InputIt, OutputIt, T, Size, ReductionOp> >
+        reduce_agent;
+
+    typename reduce_agent::Plan reduce_plan = reduce_agent::get_plan(stream);
+
+    cudaError_t status = cudaSuccess;
+
+
+    if (num_items <= reduce_plan.items_per_tile)
+    {
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size, 1);
+
+      // small, single tile size
+      if (d_temp_storage == NULL)
+      {
+        temp_storage_bytes = max<size_t>(1, vshmem_size);
+        return status;
+      }
+      char *vshmem_ptr = vshmem_size > 0 ? (char*)d_temp_storage : NULL;
+
+      reduce_agent ra(reduce_plan, num_items, stream, vshmem_ptr, "reduce_agent: single_tile only");
+      ra.launch(input_it, output_it, num_items, reduction_op, init);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+    else
+    {
+      // regular size
+      cuda_optional<int> sm_count = core::get_sm_count();
+      CUDA_CUB_RET_IF_FAIL(sm_count.status());
+
+      // reduction will not use more cta counts than requested
+      cuda_optional<int> max_blocks_per_sm =
+          reduce_agent::
+              template get_max_blocks_per_sm<InputIt,
+                                             OutputIt,
+                                             Size,
+                                             cub::GridEvenShare<Size>,
+                                             cub::GridQueue<UnsignedSize>,
+                                             ReductionOp>(reduce_plan);
+      CUDA_CUB_RET_IF_FAIL(max_blocks_per_sm.status());
+
+
+
+      int reduce_device_occupancy = (int)max_blocks_per_sm * sm_count;
+
+      int sm_oversubscription = 5;
+      int max_blocks          = reduce_device_occupancy * sm_oversubscription;
+
+      cub::GridEvenShare<Size> even_share;
+      even_share.DispatchInit(static_cast<int>(num_items), max_blocks,
+                              reduce_plan.items_per_tile);
+
+      // we will launch at most "max_blocks" blocks in a grid
+      // so preallocate virtual shared memory storage for this if required
+      //
+      size_t vshmem_size = core::vshmem_size(reduce_plan.shared_memory_size,
+                                             max_blocks);
+
+      // Temporary storage allocation requirements
+      void * allocations[3] = {NULL, NULL, NULL};
+      size_t allocation_sizes[3] =
+          {
+              max_blocks * sizeof(T),                            // bytes needed for privatized block reductions
+              cub::GridQueue<UnsignedSize>::AllocationSize(),    // bytes needed for grid queue descriptor0
+              vshmem_size                                        // size of virtualized shared memory storage
+          };
+      status = cub::AliasTemporaries(d_temp_storage,
+                                     temp_storage_bytes,
+                                     allocations,
+                                     allocation_sizes);
+      CUDA_CUB_RET_IF_FAIL(status);
+      if (d_temp_storage == NULL)
+      {
+        return status;
+      }
+
+      T *d_block_reductions = (T*) allocations[0];
+      cub::GridQueue<UnsignedSize> queue(allocations[1]);
+      char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[2] : NULL;
+
+
+      // Get grid size for device_reduce_sweep_kernel
+      int reduce_grid_size = 0;
+      if (reduce_plan.grid_mapping == cub::GRID_MAPPING_RAKE)
+      {
+        // Work is distributed evenly
+        reduce_grid_size = even_share.grid_size;
+      }
+      else if (reduce_plan.grid_mapping == cub::GRID_MAPPING_DYNAMIC)
+      {
+        // Work is distributed dynamically
+        size_t num_tiles = cub::DivideAndRoundUp(num_items, reduce_plan.items_per_tile);
+
+        // if not enough to fill the device with threadblocks
+        // then fill the device with threadblocks
+        reduce_grid_size = static_cast<int>((min)(num_tiles, static_cast<size_t>(reduce_device_occupancy)));
+
+        typedef AgentLauncher<DrainAgent<Size> > drain_agent;
+        AgentPlan drain_plan = drain_agent::get_plan();
+        drain_plan.grid_size = 1;
+        drain_agent da(drain_plan, stream, "__reduce::drain_agent");
+        da.launch(queue, num_items);
+        CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+      }
+      else
+      {
+        CUDA_CUB_RET_IF_FAIL(cudaErrorNotSupported);
+      }
+
+      reduce_plan.grid_size = reduce_grid_size;
+      reduce_agent ra(reduce_plan, stream, vshmem_ptr, "reduce_agent: regular size reduce");
+      ra.launch(input_it,
+                d_block_reductions,
+                num_items,
+                even_share,
+                queue,
+                reduction_op);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+
+      typedef AgentLauncher<
+        ReduceAgent<T*, OutputIt, T, Size, ReductionOp> >
+        reduce_agent_single;
+
+      reduce_plan.grid_size = 1;
+      reduce_agent_single ra1(reduce_plan, stream, vshmem_ptr, "reduce_agent: single tile reduce");
+
+      ra1.launch(d_block_reductions, output_it, reduce_grid_size, reduction_op, init);
+      CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    }
+
+    return status;
+  }    // func doit_step
+
+
+  template <typename Derived,
+            typename InputIt,
+            typename Size,
+            typename T,
+            typename BinaryOp>
+  THRUST_RUNTIME_FUNCTION
+  T reduce(execution_policy<Derived>& policy,
+           InputIt                    first,
+           Size                       num_items,
+           T                          init,
+           BinaryOp                   binary_op)
+  {
+    if (num_items == 0)
+      return init;
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       first,
+                       num_items,
+                       init,
+                       binary_op,
+                       reinterpret_cast<T*>(NULL),
+                       stream);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(T*), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
+
+    T* d_result = thrust::detail::aligned_reinterpret_cast<T*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       first,
+                       num_items,
+                       init,
+                       binary_op,
+                       d_result,
+                       stream);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "reduce failed to synchronize");
+
+    T result = cuda_cub::get_value(policy, d_result);
+
+    return result;
+  }
+}    // namespace __reduce
+
+namespace detail {
+
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename T,
+          typename BinaryOp>
+THRUST_RUNTIME_FUNCTION
+T reduce_n_impl(execution_policy<Derived>& policy,
+                InputIt                    first,
+                Size                       num_items,
+                T                          init,
+                BinaryOp                   binary_op)
 {
-namespace cuda
+  cudaStream_t stream = cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary device storage requirements.
+
+  size_t tmp_size = 0;
+
+  THRUST_INDEX_TYPE_DISPATCH(status,
+    cub::DeviceReduce::Reduce,
+    num_items,
+    (NULL, tmp_size, first, reinterpret_cast<T*>(NULL),
+        num_items_fixed, binary_op, init, stream));
+  cuda_cub::throw_on_error(status, "after reduction step 1");
+
+  // Allocate temporary storage.
+
+  thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+    tmp(policy, sizeof(T) + tmp_size);
+
+  // Run reduction.
+
+  // `tmp.begin()` yields a `normal_iterator`, which dereferences to a
+  // `reference`, which has an `operator&` that returns a `pointer`, which
+  // has a `.get` method that returns a raw pointer, which we can (finally)
+  // `static_cast` to `void*`.
+  //
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  T* ret_ptr = thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get());
+  void* tmp_ptr = static_cast<void*>((tmp.data() + sizeof(T)).get());
+  THRUST_INDEX_TYPE_DISPATCH(status,
+    cub::DeviceReduce::Reduce,
+    num_items,
+    (tmp_ptr, tmp_size, first, ret_ptr,
+        num_items_fixed, binary_op, init, stream));
+  cuda_cub::throw_on_error(status, "after reduction step 2");
+
+  // Synchronize the stream and get the value.
+
+  status = cuda_cub::synchronize(policy);
+  cuda_cub::throw_on_error(status, "reduce failed to synchronize");
+
+  // `tmp.begin()` yields a `normal_iterator`, which dereferences to a
+  // `reference`, which has an `operator&` that returns a `pointer`, which
+  // has a `.get` method that returns a raw pointer, which we can (finally)
+  // `static_cast` to `void*`.
+  //
+  // The array was dynamically allocated, so we assume that it's suitably
+  // aligned for any type of data. `malloc`/`cudaMalloc`/`new`/`std::allocator`
+  // make this guarantee.
+  return thrust::cuda_cub::get_value(policy,
+    thrust::detail::aligned_reinterpret_cast<T*>(tmp.data().get()));
+}
+
+} // namespace detail
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename T,
+          typename BinaryOp>
+__host__ __device__
+T reduce_n(execution_policy<Derived>& policy,
+           InputIt                    first,
+           Size                       num_items,
+           T                          init,
+           BinaryOp                   binary_op)
 {
-namespace detail
+  THRUST_CDP_DISPATCH((init =
+                         thrust::cuda_cub::detail::reduce_n_impl(policy,
+                                                                 first,
+                                                                 num_items,
+                                                                 init,
+                                                                 binary_op);),
+                      (init = thrust::reduce(cvt_to_seq(derived_cast(policy)),
+                                             first,
+                                             first + num_items,
+                                             init,
+                                             binary_op);));
+  return init;
+}
+
+template <class Derived, class InputIt, class T, class BinaryOp>
+__host__ __device__
+T reduce(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         T                          init,
+         BinaryOp                   binary_op)
 {
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  // FIXME: Check for RA iterator.
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  return cuda_cub::reduce_n(policy, first, num_items, init, binary_op);
+}
 
+template <class Derived,
+          class InputIt,
+          class T>
+__host__ __device__
+T reduce(execution_policy<Derived> &policy,
+         InputIt                    first,
+         InputIt                    last,
+         T                          init)
+{
+  return cuda_cub::reduce(policy, first, last, init, plus<T>());
+}
 
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
+template <class Derived,
+          class InputIt>
 __host__ __device__
-OutputType reduce(execution_policy<DerivedPolicy> &exec,
-                  InputIterator first,
-                  InputIterator last,
-                  OutputType init,
-                  BinaryFunction binary_op);
+typename iterator_traits<InputIt>::value_type
+reduce(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last)
+{
+  typedef typename iterator_traits<InputIt>::value_type value_type;
+  return cuda_cub::reduce(policy, first, last, value_type(0));
+}
+
 
+} // namespace cuda_cub
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
-#include <thrust/system/cuda/detail/reduce.inl>
+#include <thrust/memory.h>
+#include <thrust/reduce.h>
 
+#endif
diff --git a/thrust/system/cuda/detail/reduce.inl b/thrust/system/cuda/detail/reduce.inl
deleted file mode 100644
index 4bdbf54b1..000000000
--- a/thrust/system/cuda/detail/reduce.inl
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file reduce.inl
- *  \brief Inline file for reduce.h
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/reduce.h>
-#include <thrust/detail/seq.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/system/cuda/detail/decomposition.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-#include <thrust/detail/type_traits.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace reduce_detail
-{
-
-
-struct reduce_partitions
-{
-  template<typename ConcurrentGroup, typename Iterator1, typename Iterator2, typename T, typename BinaryOperation>
-  __device__
-  void operator()(ConcurrentGroup &this_group, Iterator1 first, Iterator1 last, Iterator2 result, T init, BinaryOperation binary_op)
-  {
-    T sum = bulk_::reduce(this_group, first, last, init, binary_op);
-
-    if(this_group.this_exec.index() == 0)
-    {
-      *result = sum;
-    }
-  }
-
-  template<typename ConcurrentGroup, typename Iterator1, typename Iterator2, typename BinaryOperation>
-  __device__
-  void operator()(ConcurrentGroup &this_group, Iterator1 first, Iterator1 last, Iterator2 result, BinaryOperation binary_op)
-  {
-    // noticeably faster to pass the last element as the init
-    typename thrust::iterator_value<Iterator2>::type init = thrust::raw_reference_cast(last[-1]);
-    (*this)(this_group, first, last - 1, result, init, binary_op);
-  }
-
-
-  template<typename ConcurrentGroup, typename Iterator1, typename Decomposition, typename Iterator2, typename T, typename BinaryFunction>
-  __device__
-  void operator()(ConcurrentGroup &this_group, Iterator1 first, Decomposition decomp, Iterator2 result, T init, BinaryFunction binary_op)
-  {
-    typename Decomposition::range range = decomp[this_group.index()];
-
-    Iterator1 last = first + range.second;
-    first += range.first;
-
-    if(this_group.index() != 0)
-    {
-      // noticeably faster to pass the last element as the init 
-      init = thrust::raw_reference_cast(last[-1]);
-      --last;
-    } // end if
-
-    (*this)(this_group, first, last, result + this_group.index(), init, binary_op);
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
-__host__ __device__
-OutputType tuned_reduce(execution_policy<DerivedPolicy> &exec,
-                        InputIterator first,
-                        InputIterator last,
-                        OutputType init,
-                        BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_difference<InputIterator>::type size_type;
-
-  const size_type n = last - first;
-
-  if(n <= 0) return init;
-
-  cudaStream_t s = stream(thrust::detail::derived_cast(exec));
-
-  const size_type groupsize = 128;
-  const size_type grainsize = 7;
-  const size_type tile_size = groupsize * grainsize;
-  const size_type num_tiles = (n + tile_size - 1) / tile_size;
-  const size_type subscription = 10;
-
-  bulk_::concurrent_group<
-    bulk_::agent<grainsize>,
-    groupsize
-  > g;
-
-  const size_type num_groups = thrust::min<size_type>(subscription * g.hardware_concurrency(), num_tiles);
-
-  aligned_decomposition<size_type> decomp(n, num_groups, tile_size);
-
-  thrust::detail::temporary_array<OutputType,DerivedPolicy> partial_sums(exec, decomp.size());
-
-  // reduce into partial sums
-  bulk_::async(bulk_::par(s, g, decomp.size()), reduce_detail::reduce_partitions(), bulk_::root.this_exec, first, decomp, partial_sums.begin(), init, binary_op).wait();
-
-  if(partial_sums.size() > 1)
-  {
-    // reduce the partial sums
-    bulk_::async(bulk_::par(s, g, 1), reduce_detail::reduce_partitions(), bulk_::root.this_exec, partial_sums.begin(), partial_sums.end(), partial_sums.begin(), binary_op);
-  } // end while
-
-  return get_value(exec, &partial_sums[0]);
-} // end tuned_reduce()
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
-__host__ __device__
-OutputType general_reduce(execution_policy<DerivedPolicy> &exec,
-                          InputIterator first,
-                          InputIterator last,
-                          OutputType init,
-                          BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_difference<InputIterator>::type size_type;
-
-  const size_type n = last - first;
-
-  if(n <= 0) return init;
-
-  cudaStream_t s = stream(thrust::detail::derived_cast(exec));
-
-  typedef thrust::detail::temporary_array<OutputType,DerivedPolicy> temporary_array;
-
-  // automatically choose a number of groups and a group size
-  size_type num_groups = 0;
-  size_type group_size = 0;
-
-  thrust::tie(num_groups, group_size) = bulk_::choose_sizes(bulk_::grid(), reduce_partitions(), bulk_::root.this_exec, first, uniform_decomposition<size_type>(), typename temporary_array::iterator(), init, binary_op);
-
-  num_groups = thrust::min<size_type>(num_groups, thrust::detail::util::divide_ri(n, group_size));
-
-  uniform_decomposition<size_type> decomp(n, num_groups);
-  temporary_array partial_sums(exec, decomp.size());
-
-  // reduce into partial sums
-  bulk_::async(bulk_::grid(decomp.size(), group_size, bulk_::use_default, s), reduce_partitions(), bulk_::root.this_exec, first, decomp, partial_sums.begin(), init, binary_op);
-
-  if(partial_sums.size() > 1)
-  {
-    // need to rechoose the group_size because the type of the kernel launch below differs from the first one
-    thrust::tie(num_groups, group_size) = bulk_::choose_sizes(bulk_::grid(1), reduce_partitions(), bulk_::root.this_exec, partial_sums.begin(), partial_sums.end(), partial_sums.begin(), binary_op);
-
-    // reduce the partial sums
-    bulk_::async(bulk_::grid(num_groups, group_size, bulk_::use_default, s), reduce_partitions(), bulk_::root.this_exec, partial_sums.begin(), partial_sums.end(), partial_sums.begin(), binary_op);
-  } // end while
-
-  return get_value(exec, &partial_sums[0]);
-} // end general_reduce()
-
-
-// use a tuned implementation for arithmetic types
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
-__host__ __device__
-typename thrust::detail::enable_if<
-  thrust::detail::is_arithmetic<OutputType>::value,
-  OutputType
->::type
-  reduce(execution_policy<DerivedPolicy> &exec,
-         InputIterator first,
-         InputIterator last,
-         OutputType init,
-         BinaryFunction binary_op)
-{
-  return reduce_detail::tuned_reduce(exec, first, last, init, binary_op);
-} // end reduce()
-
-
-// use a general implementation for non-arithmetic types
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
-__host__ __device__
-typename thrust::detail::disable_if<
-  thrust::detail::is_arithmetic<OutputType>::value,
-  OutputType
->::type
-  reduce(execution_policy<DerivedPolicy> &exec,
-         InputIterator first,
-         InputIterator last,
-         OutputType init,
-         BinaryFunction binary_op)
-{
-  return reduce_detail::general_reduce(exec, first, last, init, binary_op);
-} // end reduce()
-
-
-
-} // end reduce_detail
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputType,
-         typename BinaryFunction>
-__host__ __device__
-OutputType reduce(execution_policy<DerivedPolicy> &exec,
-                  InputIterator first,
-                  InputIterator last,
-                  OutputType init,
-                  BinaryFunction binary_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static OutputType parallel_path(execution_policy<DerivedPolicy> &exec,
-                                    InputIterator first,
-                                    InputIterator last,
-                                    OutputType init,
-                                    BinaryFunction binary_op)
-    {
-      return thrust::system::cuda::detail::reduce_detail::reduce(exec, first, last, init, binary_op);
-    }
-
-    __host__ __device__
-    static OutputType sequential_path(execution_policy<DerivedPolicy> &,
-                                      InputIterator first,
-                                      InputIterator last,
-                                      OutputType init,
-                                      BinaryFunction binary_op)
-    {
-      return thrust::reduce(thrust::seq, first, last, init, binary_op);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, last, init, binary_op);
-#else
-  return workaround::sequential_path(exec, first, last, init, binary_op);
-#endif
-} // end reduce()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/reduce_by_key.h b/thrust/system/cuda/detail/reduce_by_key.h
index cc98a3c61..2933d062a 100644
--- a/thrust/system/cuda/detail/reduce_by_key.h
+++ b/thrust/system/cuda/detail/reduce_by_key.h
@@ -1,62 +1,1209 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
+#pragma once
 
+#include <thrust/detail/config.h>
 
-/*! \file reduce_by_key.h
- *  \brief CUDA implementation of reduce_by_key
- */
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#pragma once
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/raw_reference_cast.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
+#include <cub/device/device_reduce.cuh>
+#include <cub/util_math.cuh>
 
-namespace thrust
-{
-namespace system
+THRUST_NAMESPACE_BEGIN
+
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2,
+          typename BinaryPredicate>
+__host__ __device__ thrust::pair<OutputIterator1, OutputIterator2>
+reduce_by_key(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator1                                              keys_first,
+    InputIterator1                                              keys_last,
+    InputIterator2                                              values_first,
+    OutputIterator1                                             keys_output,
+    OutputIterator2                                             values_output,
+    BinaryPredicate                                             binary_pred);
+
+namespace cuda_cub {
+
+namespace __reduce_by_key {
+
+  template<bool> struct is_true : thrust::detail::false_type {};
+  template<> struct is_true<true> : thrust::detail::true_type {};
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_DEFAULT,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = BLOCK_THREADS * ITEMS_PER_THREAD
+    };
+
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template <class Arch, class Key, class Value>
+  struct Tuning;
+
+  template <class Key, class Value>
+  struct Tuning<sm30, Key, Value>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES)>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm30
+
+  template<class Key, class Value>
+  struct Tuning<sm35,Key,Value> : Tuning<sm30,Key,Value>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
+      NOMINAL_4B_ITEMS_PER_THREAD = 6,
+
+      ITEMS_PER_THREAD =
+          (MAX_INPUT_BYTES <= 8)
+              ? 6
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         COMBINED_INPUT_BYTES - 1) /
+                            COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm35
+
+  template<class Key, class Value>
+  struct Tuning<sm52,Key,Value> : Tuning<sm30,Key,Value>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES      = mpl::max<size_t, sizeof(Key), sizeof(Value)>::value,
+      COMBINED_INPUT_BYTES = sizeof(Key) + sizeof(Value),
+
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+
+      ITEMS_PER_THREAD =
+          (MAX_INPUT_BYTES <= 8)
+              ? 9
+              : mpl::min<
+                    int,
+                    NOMINAL_4B_ITEMS_PER_THREAD,
+                    mpl::max<
+                        int,
+                        1,
+                        ((NOMINAL_4B_ITEMS_PER_THREAD * 8) +
+                         COMBINED_INPUT_BYTES - 1) /
+                            COMBINED_INPUT_BYTES>::value>::value,
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning sm52
+
+  template <class KeysInputIt,
+            class ValuesInputIt,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class EqualityOp,
+            class ReductionOp,
+            class NumRunsOutputIt,
+            class Size>
+  struct ReduceByKeyAgent
+  {
+    typedef typename iterator_traits<KeysInputIt>::value_type   key_type;
+    typedef typename iterator_traits<ValuesInputIt>::value_type value_type;
+    typedef Size                                                size_type;
+
+    typedef cub::KeyValuePair<size_type, value_type> size_value_pair_t;
+    typedef cub::KeyValuePair<key_type, value_type>  key_value_pair_t;
+
+    typedef cub::ReduceByKeyScanTileState<value_type, size_type> ScanTileState;
+    typedef cub::ReduceBySegmentOp<ReductionOp> ReduceBySegmentOp;
+
+    template<class Arch>
+    struct PtxPlan : Tuning<Arch,key_type, value_type>::type
+    {
+      typedef Tuning<Arch, key_type, value_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysInputIt>::type    KeysLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ValuesInputIt>::type  ValuesLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt>::type   BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt>::type BlockLoadValues;
+
+      typedef cub::BlockDiscontinuity<key_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityKeys;
+
+      typedef cub::TilePrefixCallbackOp<size_value_pair_t,
+                                        ReduceBySegmentOp,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<size_value_pair_t,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage              scan;
+          typename TilePrefixCallback::TempStorage     prefix;
+          typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        } scan_storage;
+
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadValues::TempStorage load_values;
+
+        core::uninitialized_array<key_value_pair_t, PtxPlan::ITEMS_PER_TILE + 1>
+          raw_exchange;
+      };    // union TempStorage
+    };  // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt             KeysLoadIt;
+    typedef typename ptx_plan::ValuesLoadIt           ValuesLoadIt;
+    typedef typename ptx_plan::BlockLoadKeys          BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadValues        BlockLoadValues;
+    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
+    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan              BlockScan;
+    typedef typename ptx_plan::TempStorage            TempStorage;
+
+    enum
+    {
+      BLOCK_THREADS     = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD  = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE    = ptx_plan::ITEMS_PER_TILE,
+      TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1),
+
+      // Whether or not the scan operation has a zero-valued identity value
+      // (true if we're performing addition on a primitive type)
+      HAS_IDENTITY_ZERO = thrust::detail::is_same<ReductionOp,
+                                                  plus<value_type> >::value &&
+                          thrust::detail::is_arithmetic<value_type>::value
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      storage;
+      KeysLoadIt                         keys_load_it;
+      ValuesLoadIt                       values_load_it;
+      KeysOutputIt                       keys_output_it;
+      ValuesOutputIt                     values_output_it;
+      NumRunsOutputIt                    num_runs_output_it;
+      cub::InequalityWrapper<EqualityOp> inequality_op;
+      ReduceBySegmentOp                  scan_op;
+
+      //---------------------------------------------------------------------
+      // Block scan utility methods
+      //---------------------------------------------------------------------
+
+      // Scan with identity (first tile)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::true_type /* has_identity */)
+      {
+        size_value_pair_t identity;
+        identity.value = 0;
+        identity.key   = 0;
+        BlockScan(storage.scan_storage.scan)
+            .ExclusiveScan(scan_items, scan_items, identity, scan_op, tile_aggregate);
+      }
+
+      // Scan without identity (first tile).
+      // Without an identity, the first output item is undefined.
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t &tile_aggregate,
+                thrust::detail::false_type /* has_identity */)
+      {
+        BlockScan(storage.scan_storage.scan)
+            .ExclusiveScan(scan_items, scan_items, scan_op, tile_aggregate);
+      }
+
+      // Scan with identity (subsequent tile)
+      //
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::true_type /*  has_identity */)
+      {
+        BlockScan(storage.scan_storage.scan)
+            .ExclusiveScan(scan_items,
+                           scan_items,
+                           scan_op,
+                           prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      // Scan without identity (subsequent tile).
+      // Without an identity, the first output item is undefined.
+      THRUST_DEVICE_FUNCTION void
+      scan_tile(size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+                size_value_pair_t & tile_aggregate,
+                TilePrefixCallback &prefix_op,
+                thrust::detail::false_type /* has_identity */)
+      {
+        BlockScan(storage.scan_storage.scan)
+            .ExclusiveScan(scan_items,
+                           scan_items,
+                           scan_op,
+                           prefix_op);
+        tile_aggregate = prefix_op.GetBlockAggregate();
+      }
+
+      //---------------------------------------------------------------------
+      // Zip utility methods
+      //---------------------------------------------------------------------
+
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      zip_values_and_flags(size_type num_remaining,
+                           value_type (&values)[ITEMS_PER_THREAD],
+                           size_type (&segment_flags)[ITEMS_PER_THREAD],
+                           size_value_pair_t (&scan_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set segment_flags for first out-of-bounds item, zero for others
+          if (IS_LAST_TILE &&
+              Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM == num_remaining)
+            segment_flags[ITEM] = 1;
+
+          scan_items[ITEM].value = values[ITEM];
+          scan_items[ITEM].key   = segment_flags[ITEM];
+        }
+      }
+
+      THRUST_DEVICE_FUNCTION void zip_keys_and_values(
+          key_type (&keys)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_value_pair_t (&scan_items)[ITEMS_PER_THREAD],
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD])
+      {
+        // Zip values and segment_flags
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          scatter_items[ITEM].key   = keys[ITEM];
+          scatter_items[ITEM].value = scan_items[ITEM].value;
+          segment_indices[ITEM]     = scan_items[ITEM].key;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Scatter utility methods
+      //---------------------------------------------------------------------
+
+      // Directly scatter flagged items to output offsets
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false)
+      THRUST_DEVICE_FUNCTION void scatter_direct(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD])
+      {
+        // Scatter flagged keys and values
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (segment_flags[ITEM])
+          {
+            keys_output_it[segment_indices[ITEM]] = scatter_items[ITEM].key;
+            values_output_it[segment_indices[ITEM]] = scatter_items[ITEM].value;
+          }
+        }
+      }
+
+      // 2-phase scatter flagged items to output offsets
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false
+      //
+      // The exclusive scan causes each head flag to be paired with
+      // the previous value aggregate:
+      //   * the scatter offsets must be decremented for value aggregates
+      //
+      THRUST_DEVICE_FUNCTION void scatter_two_phase(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_type num_tile_segments,
+          size_type num_tile_segments_prefix)
+      {
+        using core::sync_threadblock;
+
+        sync_threadblock();
+
+        // Compact and scatter keys
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (segment_flags[ITEM])
+          {
+            int idx = static_cast<int>(segment_indices[ITEM] -
+                                       num_tile_segments_prefix);
+            storage.raw_exchange[idx] = scatter_items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS)
+        {
+          size_type        idx  = num_tile_segments_prefix + item;
+          key_value_pair_t pair = storage.raw_exchange[item];
+          keys_output_it[idx]   = pair.key;
+          values_output_it[idx] = pair.value;
+        }
+      }
+
+
+      // Scatter flagged items
+      //
+      THRUST_DEVICE_FUNCTION void scatter(
+          key_value_pair_t (&scatter_items)[ITEMS_PER_THREAD],
+          size_type (&segment_flags)[ITEMS_PER_THREAD],
+          size_type (&segment_indices)[ITEMS_PER_THREAD],
+          size_type num_tile_segments,
+          size_type num_tile_segments_prefix)
+      {
+        // Do a one-phase scatter if (a) two-phase is disabled or
+        // (b) the average number of selected items per thread is less than one
+        if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS))
+        {
+          scatter_two_phase(scatter_items,
+                            segment_flags,
+                            segment_indices,
+                            num_tile_segments,
+                            num_tile_segments_prefix);
+        }
+        else
+        {
+          scatter_direct(scatter_items,
+                         segment_flags,
+                         segment_indices);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Finalization utility methods
+      //---------------------------------------------------------------------
+
+      // Finalize the carry-out from the last tile
+      // (specialized for IS_SEGMENTED_REDUCTION_FIXUP == false)
+      THRUST_DEVICE_FUNCTION void
+      finalize_last_tile(size_type num_segments,
+                         size_type num_remaining,
+                         key_type    last_key,
+                         value_type  last_value)
+      {
+        // Last thread will output final count and last item, if necessary
+        if (threadIdx.x == BLOCK_THREADS - 1)
+        {
+          // If the last tile is a whole tile, the inclusive prefix
+          // contains accumulated value reduction for the last segment
+          if (num_remaining == ITEMS_PER_TILE)
+          {
+            // Scatter key and value
+            keys_output_it[num_segments]   = last_key;
+            values_output_it[num_segments] = last_value;
+            num_segments++;
+          }
+
+          // Output the total number of items selected
+          *num_runs_output_it = num_segments;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Cooperatively scan a device-wide sequence of tiles with other CTAs
+      //---------------------------------------------------------------------
+
+      // Process first tile of input (dynamic chained scan).
+      // Returns the running  count of segments
+      // and aggregated values (including this tile)
+      //
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_first_tile(Size           num_remaining,
+                         Size           tile_offset,
+                         ScanTileState &tile_state)
+      {
+        using core::sync_threadblock;
+
+        key_type          keys[ITEMS_PER_THREAD];               // Tile keys
+        key_type          pred_keys[ITEMS_PER_THREAD];          // Tile keys shifted up (predecessor)
+        value_type        values[ITEMS_PER_THREAD];             // Tile values
+        size_type         segment_flags[ITEMS_PER_THREAD];      // Segment head flags
+        size_type         segment_indices[ITEMS_PER_THREAD];    // Segment indices
+        size_value_pair_t scan_items[ITEMS_PER_THREAD];         // Zipped values and segment flags|indices
+        key_value_pair_t  scatter_items[ITEMS_PER_THREAD];      // Zipped key value pairs for scattering
+
+        // Load keys (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset,
+                    keys,
+                    num_remaining,
+                    *(keys_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset, keys);
+        }
+
+        sync_threadblock();
+
+        // Load values (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset,
+                    values,
+                    num_remaining,
+                    *(values_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset, values);
+        }
+
+        sync_threadblock();
+
+        // Set head segment_flags.
+        // First tile sets the first flag for the first item
+        BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
+            .FlagHeads(segment_flags, keys, pred_keys, inequality_op);
+
+        // Unset the flag for the first item in the first tile
+        // so we won't scatter it
+        //
+        if (threadIdx.x == 0)
+          segment_flags[0] = 0;
+
+        // Zip values and segment_flags
+        zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                           values,
+                                           segment_flags,
+                                           scan_items);
+
+        // Exclusive scan of values and segment_flags
+        size_value_pair_t tile_aggregate;
+        scan_tile(scan_items, tile_aggregate, is_true<HAS_IDENTITY_ZERO>());
+
+        if (threadIdx.x == 0)
+        {
+          // Update tile status if this is not the last tile
+          if (!IS_LAST_TILE)
+            tile_state.SetInclusive(0, tile_aggregate);
+
+          // Initialize the segment index for the first scan item if necessary
+          // (the exclusive prefix for the first item is garbage)
+          if (!HAS_IDENTITY_ZERO)
+            scan_items[0].key = 0;
+        }
+
+        // Unzip values and segment indices
+        zip_keys_and_values(pred_keys,
+                            segment_indices,
+                            scan_items,
+                            scatter_items);
+
+        // Scatter flagged items
+        scatter(scatter_items,
+                segment_flags,
+                segment_indices,
+                tile_aggregate.key,
+                0);
+
+        if (IS_LAST_TILE)
+        {
+          // Finalize the carry-out from the last tile
+          finalize_last_tile(tile_aggregate.key,
+                             num_remaining,
+                             keys[ITEMS_PER_THREAD - 1],
+                             tile_aggregate.value);
+        }
+      }
+
+      // Process subsequent tile of input (dynamic chained scan).
+      // Returns the running count of segments
+      // and aggregated values (including this tile)
+
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_subsequent_tile(Size           num_remaining,
+                              int            tile_idx,
+                              Size           tile_offset,
+                              ScanTileState &tile_state)
+      {
+        using core::sync_threadblock;
+
+        key_type          keys[ITEMS_PER_THREAD];               // Tile keys
+        key_type          pred_keys[ITEMS_PER_THREAD];          // Tile keys shifted up (predecessor)
+        value_type        values[ITEMS_PER_THREAD];             // Tile values
+        size_type         segment_flags[ITEMS_PER_THREAD];      // Segment head flags
+        size_type         segment_indices[ITEMS_PER_THREAD];    // Segment indices
+        size_value_pair_t scan_items[ITEMS_PER_THREAD];         // Zipped values and segment flags|indices
+        key_value_pair_t  scatter_items[ITEMS_PER_THREAD];      // Zipped key value pairs for scattering
+
+        // Load keys (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset,
+                    keys,
+                    num_remaining,
+                    *(keys_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadKeys(storage.load_keys)
+              .Load(keys_load_it + tile_offset, keys);
+        }
+
+        key_type tile_pred_key = (threadIdx.x == 0)
+                                     ? keys_load_it[tile_offset - 1]
+                                     : key_type();
+
+        sync_threadblock();
+
+        // Load values (last tile repeats final element)
+        if (IS_LAST_TILE)
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset,
+                    values,
+                    num_remaining,
+                    *(values_load_it + tile_offset));
+        }
+        else
+        {
+          BlockLoadValues(storage.load_values)
+              .Load(values_load_it + tile_offset, values);
+        }
+
+        sync_threadblock();
+
+        // Set head segment_flags
+        BlockDiscontinuityKeys(storage.scan_storage.discontinuity)
+            .FlagHeads(segment_flags,
+                       keys,
+                       pred_keys,
+                       inequality_op,
+                       tile_pred_key);
+
+        // Zip values and segment_flags
+        zip_values_and_flags<IS_LAST_TILE>(num_remaining,
+                                           values,
+                                           segment_flags,
+                                           scan_items);
+
+        // Exclusive scan of values and segment_flags
+        size_value_pair_t  tile_aggregate;
+        TilePrefixCallback prefix_op(tile_state, storage.scan_storage.prefix, scan_op, tile_idx);
+        scan_tile(scan_items,
+                  tile_aggregate,
+                  prefix_op,
+                  is_true<HAS_IDENTITY_ZERO>());
+        size_value_pair_t tile_inclusive_prefix = prefix_op.GetInclusivePrefix();
+
+        // Unzip values and segment indices
+        zip_keys_and_values(pred_keys, segment_indices, scan_items, scatter_items);
+
+        // Scatter flagged items
+        scatter(scatter_items,
+                segment_flags,
+                segment_indices,
+                tile_aggregate.key,
+                prefix_op.GetExclusivePrefix().key);
+
+        if (IS_LAST_TILE)
+        {
+          // Finalize the carry-out from the last tile
+          finalize_last_tile(tile_inclusive_prefix.key,
+                             num_remaining,
+                             keys[ITEMS_PER_THREAD - 1],
+                             tile_inclusive_prefix.value);
+        }
+      }
+      template <bool IS_LAST_TILE>
+      THRUST_DEVICE_FUNCTION void
+      consume_tile(size_type      num_remaining,
+                   int            tile_idx,
+                   size_type      tile_offset,
+                   ScanTileState &tile_state)
+      {
+        if (tile_idx == 0)
+        {
+          consume_first_tile<IS_LAST_TILE>(num_remaining,
+                                           tile_offset,
+                                           tile_state);
+        }
+        else
+        {
+          consume_subsequent_tile<IS_LAST_TILE>(num_remaining,
+                                                tile_idx,
+                                                tile_offset,
+                                                tile_state);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor : consume_range
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION impl(TempStorage &   storage_,
+                                  KeysInputIt     keys_input_it_,
+                                  ValuesInputIt   values_input_it_,
+                                  KeysOutputIt    keys_output_it_,
+                                  ValuesOutputIt  values_output_it_,
+                                  NumRunsOutputIt num_runs_output_it_,
+                                  EqualityOp      equality_op_,
+                                  ReductionOp     reduction_op_,
+                                  Size            num_items,
+                                  int             /*num_tiles*/,
+                                  ScanTileState & tile_state)
+          : storage(storage_),
+            keys_load_it(core::make_load_iterator(ptx_plan(), keys_input_it_)),
+            values_load_it(core::make_load_iterator(ptx_plan(), values_input_it_)),
+            keys_output_it(keys_output_it_),
+            values_output_it(values_output_it_),
+            num_runs_output_it(num_runs_output_it_),
+            inequality_op(equality_op_),
+            scan_op(reduction_op_)
+      {
+        // Blocks are launched in increasing order,
+        // so just assign one tile per block
+        //
+        int  tile_idx          = blockIdx.x;
+        Size tile_offset       = static_cast<Size>(tile_idx) * ITEMS_PER_TILE;
+        Size num_remaining     = num_items - tile_offset;
+
+        if (num_remaining > ITEMS_PER_TILE)
+        {
+          // Not the last tile (full)
+          consume_tile<false>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+        else if (num_remaining > 0)
+        {
+          // The last tile (possibly partially-full)
+          consume_tile<true>(num_remaining, tile_idx, tile_offset, tile_state);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysInputIt     keys_input_it,
+                       ValuesInputIt   values_input_it,
+                       KeysOutputIt    keys_output_it,
+                       ValuesOutputIt  values_output_it,
+                       NumRunsOutputIt num_runs_output_it,
+                       ScanTileState   tile_state,
+                       EqualityOp      equality_op,
+                       ReductionOp     reduction_op,
+                       Size            num_items,
+                       int             num_tiles,
+                       char *          shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage*>(shmem);
+
+      impl(storage,
+           keys_input_it,
+           values_input_it,
+           keys_output_it,
+           values_output_it,
+           num_runs_output_it,
+           equality_op,
+           reduction_op,
+           num_items,
+           num_tiles,
+           tile_state);
+    }
+
+  };    // struct ReduceByKeyAgent
+
+  template <class ScanTileState,
+            class Size,
+            class NumSelectedIt>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char *        /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+  }; // struct InitAgent
+
+  template <class KeysInputIt,
+            class ValuesInputIt,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class NumRunsOutputIt,
+            class EqualityOp,
+            class ReductionOp,
+            class Size>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *          d_temp_storage,
+            size_t &        temp_storage_bytes,
+            KeysInputIt     keys_input_it,
+            ValuesInputIt   values_input_it,
+            KeysOutputIt    keys_output_it,
+            ValuesOutputIt  values_output_it,
+            NumRunsOutputIt num_runs_output_it,
+            EqualityOp      equality_op,
+            ReductionOp     reduction_op,
+            Size            num_items,
+            cudaStream_t    stream)
+  {
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    cudaError_t status = cudaSuccess;
+    if (num_items == 0)
+      return cudaErrorNotSupported;
+
+    typedef AgentLauncher<
+        ReduceByKeyAgent<KeysInputIt,
+                         ValuesInputIt,
+                         KeysOutputIt,
+                         ValuesOutputIt,
+                         EqualityOp,
+                         ReductionOp,
+                         NumRunsOutputIt,
+                         Size> >
+        reduce_by_key_agent;
+
+    typedef typename reduce_by_key_agent::ScanTileState ScanTileState;
+    typedef AgentLauncher<
+        InitAgent<ScanTileState,
+                  Size,
+                  NumRunsOutputIt> >
+        init_agent;
+
+    AgentPlan reduce_by_key_plan = reduce_by_key_agent::get_plan(stream);
+    AgentPlan init_plan          = init_agent::get_plan();
+
+    // Number of input tiles
+    int  tile_size = reduce_by_key_plan.items_per_tile;
+    Size num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+    size_t vshmem_size = core::vshmem_size(reduce_by_key_plan.shared_memory_size,
+                                           num_tiles);
+
+    size_t allocation_sizes[2] = {9, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    init_agent ia(init_plan, num_tiles, stream, "reduce_by_key::init_agent");
+    ia.launch(tile_state, num_tiles, num_runs_output_it);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    reduce_by_key_agent rbka(reduce_by_key_plan,
+                             num_items,
+                             stream,
+                             vshmem_ptr,
+                             "reduce_by_keys::reduce_by_key_agent");
+    rbka.launch(keys_input_it,
+                values_input_it,
+                keys_output_it,
+                values_output_it,
+                num_runs_output_it,
+                tile_state,
+                equality_op,
+                reduction_op,
+                num_items,
+                num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Size,
+            typename Derived,
+            typename KeysInputIt,
+            typename ValuesInputIt,
+            typename KeysOutputIt,
+            typename ValuesOutputIt,
+            typename EqualityOp,
+            typename ReductionOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  reduce_by_key_dispatch(execution_policy<Derived>& policy,
+                         KeysInputIt                keys_first,
+                         Size                       num_items,
+                         ValuesInputIt              values_first,
+                         KeysOutputIt               keys_output,
+                         ValuesOutputIt             values_output,
+                         EqualityOp                 equality_op,
+                         ReductionOp                reduction_op)
+  {
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    if (num_items == 0)
+    {
+      return thrust::make_pair(keys_output, values_output);
+    }
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       keys_first,
+                       values_first,
+                       keys_output,
+                       values_output,
+                       reinterpret_cast<Size*>(NULL),
+                       equality_op,
+                       reduction_op,
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "reduce_by_key failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(Size), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "reduce failed on 2nd alias_storage");
+
+    Size* d_num_runs_out
+      = thrust::detail::aligned_reinterpret_cast<Size*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       keys_first,
+                       values_first,
+                       keys_output,
+                       values_output,
+                       d_num_runs_out,
+                       equality_op,
+                       reduction_op,
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "reduce_by_key failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "reduce_by_key: failed to synchronize");
+
+    int num_runs_out = cuda_cub::get_value(policy, d_num_runs_out);
+
+    return thrust::make_pair(
+      keys_output + num_runs_out,
+      values_output + num_runs_out
+    );
+  }
+
+  template <typename Derived,
+            typename KeysInputIt,
+            typename ValuesInputIt,
+            typename KeysOutputIt,
+            typename ValuesOutputIt,
+            typename EqualityOp,
+            typename ReductionOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  reduce_by_key(execution_policy<Derived>& policy,
+                KeysInputIt                keys_first,
+                KeysInputIt                keys_last,
+                ValuesInputIt              values_first,
+                KeysOutputIt               keys_output,
+                ValuesOutputIt             values_output,
+                EqualityOp                 equality_op,
+                ReductionOp                reduction_op)
+  {
+    using size_type = typename iterator_traits<KeysInputIt>::difference_type;
+
+    size_type num_items = thrust::distance(keys_first, keys_last);
+
+    pair<KeysOutputIt, ValuesOutputIt> result = thrust::make_pair(keys_output, values_output);
+
+    if (num_items == 0)
+    {
+      return result;
+    }
+
+    THRUST_INDEX_TYPE_DISPATCH(result,
+                               reduce_by_key_dispatch,
+                               num_items,
+                               (policy,
+                                keys_first,
+                                num_items_fixed,
+                                values_first,
+                                keys_output,
+                                values_output,
+                                equality_op,
+                                reduction_op));
+
+    return result;
+  }
+
+}    // namespace __reduce_by_key
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred,
+          class BinaryOp>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output,
+              BinaryPred                 binary_pred,
+              BinaryOp                   binary_op)
 {
-namespace cuda
+  auto ret = thrust::make_pair(keys_output, values_output);
+  THRUST_CDP_DISPATCH((ret = __reduce_by_key::reduce_by_key(policy,
+                                                            keys_first,
+                                                            keys_last,
+                                                            values_first,
+                                                            keys_output,
+                                                            values_output,
+                                                            binary_pred,
+                                                            binary_op);),
+                      (ret =
+                         thrust::reduce_by_key(cvt_to_seq(derived_cast(policy)),
+                                               keys_first,
+                                               keys_last,
+                                               values_first,
+                                               keys_output,
+                                               values_output,
+                                               binary_pred,
+                                               binary_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output,
+              BinaryPred                 binary_pred)
 {
-namespace detail
+  typedef typename thrust::detail::eval_if<
+    thrust::detail::is_output_iterator<ValOutputIt>::value,
+    thrust::iterator_value<ValInputIt>,
+    thrust::iterator_value<ValOutputIt>
+  >::type value_type;
+  return cuda_cub::reduce_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              keys_output,
+                              values_output,
+                              binary_pred,
+                              plus<value_type>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+reduce_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              KeyOutputIt                keys_output,
+              ValOutputIt                values_output)
 {
+  typedef typename thrust::iterator_value<KeyInputIt>::type KeyT;
+  return cuda_cub::reduce_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              keys_output,
+                              values_output,
+                              equal_to<KeyT>());
+}
+
+} // namespace cuda_
 
+THRUST_NAMESPACE_END
 
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename BinaryPredicate,
-         typename BinaryFunction>
-__host__ __device__
-thrust::pair<OutputIterator1,OutputIterator2>
-  reduce_by_key(execution_policy<DerivedPolicy> &exec,
-                InputIterator1 keys_first, 
-                InputIterator1 keys_last,
-                InputIterator2 values_first,
-                OutputIterator1 keys_output,
-                OutputIterator2 values_output,
-                BinaryPredicate binary_pred,
-                BinaryFunction binary_op);
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/reduce_by_key.inl>
+#include <thrust/memory.h>
+#include <thrust/reduce.h>
 
+#endif
diff --git a/thrust/system/cuda/detail/reduce_by_key.inl b/thrust/system/cuda/detail/reduce_by_key.inl
deleted file mode 100644
index 60c2756d4..000000000
--- a/thrust/system/cuda/detail/reduce_by_key.inl
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-#include <thrust/detail/config.h>
-#include <thrust/reduce.h>
-#include <thrust/detail/seq.h>
-#include <thrust/system/cuda/detail/reduce_by_key.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/range/head_flags.h>
-#include <thrust/detail/range/tail_flags.h>
-#include <thrust/system/cuda/detail/reduce_intervals.hpp>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace reduce_by_key_detail
-{
-
-
-struct reduce_by_key_kernel
-{
-  template<typename ConcurrentGroup,
-           typename RandomAccessIterator1,
-           typename Decomposition,
-           typename RandomAccessIterator2,
-           typename RandomAccessIterator3,
-           typename RandomAccessIterator4,
-           typename RandomAccessIterator5,
-           typename RandomAccessIterator6,
-           typename RandomAccessIterator7,
-           typename BinaryPredicate,
-           typename BinaryFunction>
-  __device__
-  thrust::pair<RandomAccessIterator3,RandomAccessIterator4>
-  operator()(ConcurrentGroup &g,
-             RandomAccessIterator1 keys_first,
-             Decomposition decomp,
-             RandomAccessIterator2 values_first,
-             RandomAccessIterator3 keys_result,
-             RandomAccessIterator4 values_result,
-             RandomAccessIterator5 interval_output_offsets,
-             RandomAccessIterator6 interval_values,
-             RandomAccessIterator7 is_carry,
-             //BinaryPredicate pred,
-             //BinaryFunction binary_op)
-             thrust::tuple<BinaryPredicate,BinaryFunction> pred_and_binary_op)
-  {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type key_type;
-    typedef typename thrust::iterator_value<RandomAccessIterator2>::type value_type;
-
-    BinaryPredicate pred = thrust::get<0>(pred_and_binary_op);
-    BinaryFunction binary_op = thrust::get<1>(pred_and_binary_op);
-
-    thrust::detail::tail_flags<RandomAccessIterator1,BinaryPredicate> tail_flags(keys_first, keys_first + decomp.n(), pred);
-
-    typename Decomposition::size_type input_first, input_last;
-    thrust::tie(input_first,input_last) = decomp[g.index()];
-
-    typename Decomposition::size_type output_first = g.index() == 0 ? 0 : interval_output_offsets[g.index() - 1];
-
-    key_type init_key     = keys_first[input_first];
-    value_type init_value = values_first[input_first];
-
-    // the inits become the carries
-    thrust::tie(keys_result, values_result, init_key, init_value) =
-      bulk_::reduce_by_key(g,
-                           keys_first + input_first + 1,
-                           keys_first + input_last,
-                           values_first + input_first + 1,
-                           keys_result + output_first,
-                           values_result + output_first,
-                           init_key,
-                           init_value,
-                           pred,
-                           binary_op);
-
-    if(g.this_exec.index() == 0)
-    {
-      bool interval_has_carry = !tail_flags[input_last-1];
-
-      if(interval_has_carry)
-      {
-        interval_values[g.index()] = init_value;
-      } // end if
-      else
-      {
-        *keys_result   = init_key;
-        *values_result = init_value;
-
-        ++keys_result;
-        ++values_result;
-      } // end else
-
-      is_carry[g.index()] = interval_has_carry;
-    } // end if
-
-    return thrust::make_pair(keys_result, values_result);
-  }
-
-
-  template<typename ConcurrentGroup,
-           typename RandomAccessIterator1,
-           typename RandomAccessIterator2,
-           typename RandomAccessIterator3,
-           typename RandomAccessIterator4,
-           typename BinaryPredicate,
-           typename BinaryFunction,
-           typename Iterator>
-  __device__
-  void operator()(ConcurrentGroup      &g,
-                  RandomAccessIterator1 keys_first,
-                  RandomAccessIterator1 keys_last,
-                  RandomAccessIterator2 values_first,
-                  RandomAccessIterator3 keys_result,
-                  RandomAccessIterator4 values_result,
-                  BinaryPredicate       pred,
-                  BinaryFunction        binary_op,
-                  Iterator result_size)
-  {
-    RandomAccessIterator3 old_keys_result = keys_result;
-
-    thrust::tie(keys_result, values_result) =
-      operator()(g, keys_first, make_trivial_decomposition(keys_last - keys_first), values_first, keys_result, values_result,
-                 thrust::make_constant_iterator<int>(0),
-                 thrust::make_discard_iterator(),
-                 thrust::make_discard_iterator(),
-                 thrust::make_tuple(pred,binary_op));
-
-    if(g.this_exec.index() == 0)
-    {
-      *result_size = keys_result - old_keys_result;
-    }
-  }
-};
-
-
-struct tuple_and
-{
-  typedef bool result_type;
-
-  template<typename Tuple>
-  __host__ __device__
-  bool operator()(Tuple t)
-  {
-    return thrust::get<0>(t) && thrust::get<1>(t);
-  }
-};
-
-
-template<typename DerivedPolicy,
-         typename Iterator1,
-         typename Iterator2,
-         typename Iterator3,
-         typename Iterator4,
-         typename BinaryFunction>
-__host__ __device__
-void sum_tail_carries(execution_policy<DerivedPolicy> &exec,
-                      Iterator1 interval_values_first,
-                      Iterator1 interval_values_last,
-                      Iterator2 interval_output_offsets_first,
-                      Iterator2 interval_output_offsets_last,
-                      Iterator3 is_carry,
-                      Iterator4 values_result,
-                      BinaryFunction binary_op)
-{
-  typedef thrust::zip_iterator<thrust::tuple<Iterator2,Iterator3> > zip_iterator;
-
-  thrust::detail::tail_flags<zip_iterator> tail_flags(thrust::make_zip_iterator(thrust::make_tuple(interval_output_offsets_first, is_carry)),
-                                                      thrust::make_zip_iterator(thrust::make_tuple(interval_output_offsets_last,  is_carry)));
-
-  // for each value in the array of interval values
-  //   if it is a carry and it is the tail value in its segment
-  //     scatter it to its location in the output array, but sum it together with the value there previously
-  thrust::transform_if(exec,
-                       interval_values_first, interval_values_last,
-                       thrust::make_permutation_iterator(values_result, interval_output_offsets_first),
-                       thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(tail_flags.begin(), is_carry)), tuple_and()),
-                       thrust::make_permutation_iterator(values_result, interval_output_offsets_first),
-                       binary_op,
-                       thrust::identity<bool>());
-} // end sum_tail_carries()
-
-
-template<typename InputIterator, typename OutputIterator, typename BinaryFunction>
-struct intermediate_type
-  : thrust::detail::eval_if<
-    thrust::detail::has_result_type<BinaryFunction>::value,
-    thrust::detail::result_type<BinaryFunction>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >
-{};
-
-
-template<typename Size,
-         typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename BinaryPredicate,
-         typename BinaryFunction>
-__host__ __device__
-thrust::pair<OutputIterator1,OutputIterator2>
-reduce_by_key(execution_policy<DerivedPolicy> &exec,
-              InputIterator1 keys_first, 
-              InputIterator1 keys_last,
-              InputIterator2 values_first,
-              OutputIterator1 keys_result,
-              OutputIterator2 values_result,
-              BinaryPredicate binary_pred,
-              BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
-  typedef typename thrust::iterator_value<InputIterator2>::type      value_type;
-  typedef Size size_type;
-
-  const difference_type n = keys_last - keys_first;
-
-  if(n <= 0) return thrust::make_pair(keys_result, values_result);
-
-  const size_type threshold_of_parallelism = 20000;
-
-  if(n <= threshold_of_parallelism)
-  {
-    thrust::detail::temporary_array<size_type,DerivedPolicy> result_size_storage(exec, 1);
-
-    // XXX these sizes aren't actually optimal, but anything larger
-    //     will cause sm_1x to run out of smem at compile time
-    // XXX all of this grossness would go away if we could rely on shmalloc
-    const int groupsize =
-      (sizeof(value_type) <=     sizeof(int)) ? 512 :
-      (sizeof(value_type) <= 2 * sizeof(int)) ? 256 :
-      128;
-
-    const int grainsize = (sizeof(value_type) == sizeof(int)) ? 3 : 5;
-
-    size_type heap_size = groupsize * grainsize * (sizeof(size_type) + sizeof(value_type));
-    bulk_::async(bulk_::grid<groupsize,grainsize>(1,heap_size,stream(thrust::detail::derived_cast(exec))), reduce_by_key_detail::reduce_by_key_kernel(),
-      bulk_::root.this_exec, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op, result_size_storage.begin());
-
-    size_type result_size = result_size_storage[0];
-
-    return thrust::make_pair(keys_result + result_size, values_result + result_size);
-  } // end if
-
-  typedef typename reduce_by_key_detail::intermediate_type<
-    InputIterator2, OutputIterator2, BinaryFunction
-  >::type intermediate_type;
-
-  const size_type groupsize = 128;
-  const size_type grainsize = 5;
-  size_type tile_size = groupsize * grainsize;
-
-  const size_type interval_size = threshold_of_parallelism; 
-
-  size_type subscription = 100;
-  size_type num_groups = thrust::min<size_type>(subscription * bulk_::concurrent_group<>::hardware_concurrency(), (n + interval_size - 1) / interval_size);
-  aligned_decomposition<size_type> decomp(n, num_groups, tile_size);
-
-  // count the number of tail flags in each interval
-  thrust::detail::tail_flags<
-    InputIterator1,
-    BinaryPredicate,
-    size_type
-  > tail_flags(keys_first, keys_last, binary_pred);
-
-  thrust::detail::temporary_array<size_type,DerivedPolicy> interval_output_offsets(exec, decomp.size());
-
-  reduce_intervals_(exec, tail_flags.begin(), decomp, interval_output_offsets.begin(), thrust::plus<size_type>());
-
-  // scan the interval counts
-  thrust::inclusive_scan(exec, interval_output_offsets.begin(), interval_output_offsets.end(), interval_output_offsets.begin());
-
-  // reduce each interval
-  thrust::detail::temporary_array<bool,DerivedPolicy> is_carry(exec, decomp.size());
-  thrust::detail::temporary_array<intermediate_type,DerivedPolicy> interval_values(exec, decomp.size());
-
-  size_type heap_size = tile_size * (sizeof(size_type) + sizeof(value_type));
-  bulk_::async(bulk_::grid<groupsize,grainsize>(decomp.size(),heap_size,stream(thrust::detail::derived_cast(exec))), reduce_by_key_detail::reduce_by_key_kernel(),
-    bulk_::root.this_exec, keys_first, decomp, values_first, keys_result, values_result, interval_output_offsets.begin(), interval_values.begin(), is_carry.begin(), thrust::make_tuple(binary_pred, binary_op)
-  );
-
-  // scan by key the carries
-  thrust::inclusive_scan_by_key(exec,
-                                thrust::make_zip_iterator(thrust::make_tuple(interval_output_offsets.begin(), is_carry.begin())),
-                                thrust::make_zip_iterator(thrust::make_tuple(interval_output_offsets.end(),   is_carry.end())),
-                                interval_values.begin(),
-                                interval_values.begin(),
-                                thrust::equal_to<thrust::tuple<size_type,bool> >(),
-                                binary_op);
-
-  // sum each tail carry value into the result 
-  reduce_by_key_detail::sum_tail_carries(exec,
-                                         interval_values.begin(), interval_values.end(),
-                                         interval_output_offsets.begin(), interval_output_offsets.end(),
-                                         is_carry.begin(),
-                                         values_result,
-                                         binary_op);
-
-  difference_type result_size = interval_output_offsets[interval_output_offsets.size() - 1];
-
-  return thrust::make_pair(keys_result + result_size, values_result + result_size);
-} // end reduce_by_key()
-
-
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename BinaryPredicate,
-         typename BinaryFunction>
-__host__ __device__
-thrust::pair<OutputIterator1,OutputIterator2>
-reduce_by_key(execution_policy<DerivedPolicy> &exec,
-              InputIterator1 keys_first, 
-              InputIterator1 keys_last,
-              InputIterator2 values_first,
-              OutputIterator1 keys_result,
-              OutputIterator2 values_result,
-              BinaryPredicate binary_pred,
-              BinaryFunction binary_op)
-{
-  thrust::pair<OutputIterator1,OutputIterator2> result(keys_result, values_result);
-
-  typedef typename thrust::iterator_difference<InputIterator1>::type difference_type;
-
-  // opportunistically use a narrower type for counting when possible 
-  // this is a significant performance optimization in the range of 10-15%
-  if(keys_last - keys_first <= static_cast<difference_type>(UINT_MAX))
-  {
-    result = reduce_by_key_detail::reduce_by_key<unsigned int>(exec,
-                                                               keys_first, keys_last,
-                                                               values_first,
-                                                               keys_result,
-                                                               values_result,
-                                                               binary_pred,
-                                                               binary_op);
-  } // end if
-  else
-  {
-    result = reduce_by_key_detail::reduce_by_key<difference_type>(exec,
-                                                                  keys_first, keys_last,
-                                                                  values_first,
-                                                                  keys_result,
-                                                                  values_result,
-                                                                  binary_pred,
-                                                                  binary_op);
-  } // end else
-
-  return result;
-} // end reduce_by_key()
-
-
-} // end namespace reduce_by_key_detail
-
-
-template<typename DerivedPolicy,
-         typename InputIterator1,
-         typename InputIterator2,
-         typename OutputIterator1,
-         typename OutputIterator2,
-         typename BinaryPredicate,
-         typename BinaryFunction>
-__host__ __device__
-thrust::pair<OutputIterator1,OutputIterator2>
-reduce_by_key(execution_policy<DerivedPolicy> &exec,
-              InputIterator1 keys_first, 
-              InputIterator1 keys_last,
-              InputIterator2 values_first,
-              OutputIterator1 keys_result,
-              OutputIterator2 values_result,
-              BinaryPredicate binary_pred,
-              BinaryFunction binary_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    static __host__ __device__
-    thrust::pair<OutputIterator1,OutputIterator2>
-    parallel_path(execution_policy<DerivedPolicy> &exec,
-                  InputIterator1 keys_first,
-                  InputIterator1 keys_last,
-                  InputIterator2 values_first,
-                  OutputIterator1 keys_result,
-                  OutputIterator2 values_result,
-                  BinaryPredicate binary_pred,
-                  BinaryFunction binary_op)
-    {
-      return thrust::system::cuda::detail::reduce_by_key_detail::reduce_by_key(exec, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
-    }
-
-    static __host__ __device__
-    thrust::pair<OutputIterator1,OutputIterator2>
-    sequential_path(execution_policy<DerivedPolicy> &,
-                    InputIterator1 keys_first,
-                    InputIterator1 keys_last,
-                    InputIterator2 values_first,
-                    OutputIterator1 keys_result,
-                    OutputIterator2 values_result,
-                    BinaryPredicate binary_pred,
-                    BinaryFunction binary_op)
-    {
-      return thrust::reduce_by_key(thrust::seq, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
-#else
-  return workaround::sequential_path(exec, keys_first, keys_last, values_first, keys_result, values_result, binary_pred, binary_op);
-#endif
-} // end reduce_by_key()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/reduce_intervals.h b/thrust/system/cuda/detail/reduce_intervals.h
deleted file mode 100644
index 20c600f0e..000000000
--- a/thrust/system/cuda/detail/reduce_intervals.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file reduce_intervals.h
- *  \brief CUDA implementations of reduce_intervals algorithms.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction,
-         typename Decomposition>
-__host__ __device__
-void reduce_intervals(execution_policy<DerivedPolicy> &exec,
-                      InputIterator input,
-                      OutputIterator output,
-                      BinaryFunction binary_op,
-                      Decomposition decomp);
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/reduce_intervals.inl>
-
diff --git a/thrust/system/cuda/detail/reduce_intervals.hpp b/thrust/system/cuda/detail/reduce_intervals.hpp
deleted file mode 100644
index d91b20460..000000000
--- a/thrust/system/cuda/detail/reduce_intervals.hpp
+++ /dev/null
@@ -1,74 +0,0 @@
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/system/cuda/detail/decomposition.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/system/cuda/detail/reduce_intervals.hpp>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace reduce_intervals_detail
-{
-
-
-struct reduce_intervals_kernel
-{
-  template<std::size_t groupsize, std::size_t grainsize, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename BinaryFunction>
-  __device__ void operator()(bulk_::concurrent_group<bulk_::agent<grainsize>,groupsize> &this_group,
-                             RandomAccessIterator1 first,
-                             Decomposition decomp,
-                             RandomAccessIterator2 result,
-                             BinaryFunction binary_op)
-  {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-
-    typename Decomposition::range rng = decomp[this_group.index()];
-
-    value_type init = first[rng.second-1];
-
-    value_type sum = bulk_::reduce(this_group, first + rng.first, first + rng.second - 1, init, binary_op);
-
-    if(this_group.this_exec.index() == 0)
-    {
-      result[this_group.index()] = sum;
-    } // end if
-  } // end operator()
-}; // end reduce_intervals_kernel
-
-
-} // end reduce_intervals_detail
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename BinaryFunction>
-__host__ __device__
-RandomAccessIterator2 reduce_intervals_(execution_policy<DerivedPolicy> &exec, RandomAccessIterator1 first, Decomposition decomp, RandomAccessIterator2 result, BinaryFunction binary_op)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator2>::type result_type;
-  const size_t groupsize = 128;
-  size_t heap_size = groupsize * sizeof(result_type);
-  bulk_::async(bulk_::grid<groupsize,7>(decomp.size(),heap_size,stream(thrust::detail::derived_cast(exec))), reduce_intervals_detail::reduce_intervals_kernel(), bulk_::root.this_exec, first, decomp, result, binary_op);
-
-  return result + decomp.size();
-} // end reduce_intervals()
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator1, typename Size, typename RandomAccessIterator2, typename BinaryFunction>
-__host__ __device__
-RandomAccessIterator2 reduce_intervals_(execution_policy<DerivedPolicy> &exec, RandomAccessIterator1 first, RandomAccessIterator1 last, Size interval_size, RandomAccessIterator2 result, BinaryFunction binary_op)
-{
-  return thrust::system::cuda::detail::reduce_intervals_(exec, first, make_blocked_decomposition<Size>(last - first,interval_size), result, binary_op);
-} // end reduce_intervals()
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/reduce_intervals.inl b/thrust/system/cuda/detail/reduce_intervals.inl
deleted file mode 100644
index bd1417ac5..000000000
--- a/thrust/system/cuda/detail/reduce_intervals.inl
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-
-#include <thrust/iterator/iterator_traits.h>
-
-#include <thrust/detail/minmax.h>
-#include <thrust/system/detail/internal/decompose.h>
-#include <thrust/system/cuda/detail/extern_shared_ptr.h>
-#include <thrust/system/cuda/detail/block/reduce.h>
-#include <thrust/system/cuda/detail/detail/launch_closure.h>
-#include <thrust/system/cuda/detail/detail/launch_calculator.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction,
-         typename Decomposition,
-         typename Context>
-struct commutative_reduce_intervals_closure
-{
-  InputIterator  input;
-  OutputIterator output;
-  BinaryFunction binary_op;
-  Decomposition  decomposition;
-  unsigned int shared_array_size;
-
-  typedef Context context_type;
-  context_type context;
-
-  __host__ __device__
-  commutative_reduce_intervals_closure(InputIterator input, OutputIterator output, BinaryFunction binary_op, Decomposition decomposition, unsigned int shared_array_size, Context context = Context())
-    : input(input), output(output), binary_op(binary_op), decomposition(decomposition), shared_array_size(shared_array_size), context(context) {}
-
-  __device__ __thrust_forceinline__
-  void operator()(void)
-  {
-    typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
-    extern_shared_ptr<OutputType>  shared_array;
-
-    typedef typename Decomposition::index_type index_type;
-   
-    // this block processes results in [range.begin(), range.end())
-    thrust::system::detail::internal::index_range<index_type> range = decomposition[context.block_index()];
-
-    index_type i = range.begin() + context.thread_index();
-      
-    input += i;
-
-    if(range.size() < context.block_dimension())
-    {
-      // compute reduction with the first shared_array_size threads
-      if(context.thread_index() < thrust::min<index_type>(shared_array_size,range.size()))
-      {
-        OutputType sum = *input;
-
-        i     += shared_array_size;
-        input += shared_array_size;
-
-        while(i < range.end())
-        {
-          OutputType val = *input;
-
-          sum = binary_op(sum, val);
-
-          i      += shared_array_size;
-          input  += shared_array_size;
-        }
-
-        shared_array[context.thread_index()] = sum;  
-      }
-    }
-    else
-    {
-      // compute reduction with all blockDim.x threads
-      OutputType sum = *input;
-
-      i     += context.block_dimension();
-      input += context.block_dimension();
-
-      while(i < range.end())
-      {
-        OutputType val = *input;
-
-        sum = binary_op(sum, val);
-
-        i      += context.block_dimension();
-        input  += context.block_dimension();
-      }
-
-      // write first shared_array_size values into shared memory
-      if(context.thread_index() < shared_array_size)
-      {
-        shared_array[context.thread_index()] = sum;  
-      }
-
-      // accumulate remaining values (if any) to shared memory in stages
-      if(context.block_dimension() > shared_array_size)
-      {
-        unsigned int lb = shared_array_size;
-        unsigned int ub = shared_array_size + lb;
-        
-        while(lb < context.block_dimension())
-        {
-          context.barrier();
-
-          if(lb <= context.thread_index() && context.thread_index() < ub)
-          {
-            OutputType tmp = shared_array[context.thread_index() - lb];
-            shared_array[context.thread_index() - lb] = binary_op(tmp, sum);
-          }
-
-          lb += shared_array_size;
-          ub += shared_array_size;
-        }
-      }
-    }
-  
-    context.barrier();
-
-    block::reduce_n(context, shared_array, thrust::min<index_type>(range.size(), shared_array_size), binary_op);
-  
-    if(context.thread_index() == 0)
-    {
-      output += context.block_index();
-      *output = shared_array[0];
-    }
-  }
-};
-
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_BEGIN
-
-
-template<typename ExecutionPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename BinaryFunction,
-         typename Decomposition>
-__host__ __device__
-void reduce_intervals(execution_policy<ExecutionPolicy> &exec,
-                      InputIterator input,
-                      OutputIterator output,
-                      BinaryFunction binary_op,
-                      Decomposition decomp)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  if(decomp.size() == 0)
-  {
-    return;
-  }
-  
-  // TODO if (decomp.size() > deviceProperties.maxGridSize[0]) throw cuda exception (or handle general case)
-
-  typedef detail::blocked_thread_array Context;
-  typedef commutative_reduce_intervals_closure<InputIterator,OutputIterator,BinaryFunction,Decomposition,Context> Closure;
-  typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
-  
-  detail::launch_calculator<Closure> calculator;
-
-  thrust::tuple<size_t,size_t,size_t> config = calculator.with_variable_block_size_available_smem();
-
-  //size_t max_blocks = thrust::get<0>(config);
-  size_t block_size = thrust::get<1>(config);
-  size_t max_memory = thrust::get<2>(config);
-
-  // determine shared array size
-  size_t shared_array_size  = thrust::min(max_memory / sizeof(OutputType), block_size);
-  size_t shared_array_bytes = sizeof(OutputType) * shared_array_size;
-  
-  // TODO if (shared_array_size < 1) throw cuda exception "insufficient shared memory"
-
-  Closure closure(input, output, binary_op, decomp, shared_array_size);
-  detail::launch_closure(exec, closure, decomp.size(), block_size, shared_array_bytes);
-}
-
-
-__THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING_END
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/remove.h b/thrust/system/cuda/detail/remove.h
index c6ae90664..836d8f5ea 100644
--- a/thrust/system/cuda/detail/remove.h
+++ b/thrust/system/cuda/detail/remove.h
@@ -1,22 +1,134 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/copy_if.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+// in-place
+  
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class Predicate>
+InputIt __host__ __device__
+remove_if(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          StencilIt                  stencil,
+          Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, stencil, first,
+    thrust::detail::not1(predicate));
+}
+
+template <class Derived,
+          class InputIt,
+          class Predicate>
+InputIt __host__ __device__
+remove_if(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, first,
+    thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class T>
+InputIt __host__ __device__
+remove(execution_policy<Derived> &policy,
+       InputIt                    first,
+       InputIt                    last,
+       const T &                  value)
+{
+  using thrust::placeholders::_1;
+
+  return cuda_cub::remove_if(policy, first, last, _1 == value);
+}
+
+// copy
+
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class OutputIt,
+          class Predicate>
+OutputIt __host__ __device__
+remove_copy_if(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               StencilIt                  stencil,
+               OutputIt                   result,
+               Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, stencil, result,
+    thrust::detail::not1(predicate));
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class Predicate>
+OutputIt __host__ __device__
+remove_copy_if(execution_policy<Derived> &policy,
+               InputIt                    first,
+               InputIt                    last,
+               OutputIt                   result,
+               Predicate                  predicate)
+{
+  return cuda_cub::copy_if(policy, first, last, result,
+    thrust::detail::not1(predicate));
+}
+
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T>
+OutputIt __host__ __device__
+remove_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result,
+            const T &                  value)
+{
+  thrust::detail::equal_to_value<T> pred(value);
+  return cuda_cub::remove_copy_if(policy, first, last, result, pred);
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/replace.h b/thrust/system/cuda/detail/replace.h
index c6ae90664..af8b8fa95 100644
--- a/thrust/system/cuda/detail/replace.h
+++ b/thrust/system/cuda/detail/replace.h
@@ -1,22 +1,213 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/detail/internal_functional.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+  namespace __replace
+  {
+    template<class T>
+    struct constant_f
+    {
+      T value;
+
+      THRUST_FUNCTION
+      constant_f(T const &x) : value(x) {}
+
+      template<class U>
+      THRUST_DEVICE_FUNCTION
+      T operator()(U const &)  const
+      {
+        return value;
+      }
+    }; // struct constant_f
+
+    template<class Predicate, class NewType, class OutputType>
+    struct new_value_if_f
+    {
+      Predicate pred;
+      NewType new_value;
+
+      THRUST_FUNCTION
+      new_value_if_f(Predicate pred_, NewType new_value_)
+          : pred(pred_), new_value(new_value_) {}
+
+      template<class T>
+      OutputType THRUST_DEVICE_FUNCTION
+      operator()(T const &x)
+      {
+        return pred(x) ? new_value : x;
+      }
+
+      template<class T, class P>
+      OutputType THRUST_DEVICE_FUNCTION
+      operator()(T const &x, P const& y)
+      {
+        return pred(y) ? new_value : x;
+      }
+    }; // struct new_value_if_f
+
+  } // namespace __replace
+
+template <class Derived,
+          class Iterator,
+          class T>
+void __host__ __device__
+replace(execution_policy<Derived> &policy,
+        Iterator                   first,
+        Iterator                   last,
+        T const &                  old_value,
+        T const &                  new_value)
+{
+  using thrust::placeholders::_1;
+
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      _1 == old_value);
+}
+
+template <class Derived,
+          class Iterator,
+          class Predicate,
+          class T>
+void __host__ __device__
+replace_if(execution_policy<Derived> &policy,
+           Iterator                   first,
+           Iterator                   last,
+           Predicate                  pred,
+           T const &                  new_value)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      pred);
+}
+
+template <class Derived,
+          class Iterator,
+          class StencilIt,
+          class Predicate,
+          class T>
+void __host__ __device__
+replace_if(execution_policy<Derived> &policy,
+           Iterator                   first,
+           Iterator                   last,
+           StencilIt                  stencil,
+           Predicate                  pred,
+           T const &                  new_value)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      stencil,
+                      first,
+                      __replace::constant_f<T>(new_value),
+                      pred);
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class Predicate,
+          class T>
+OutputIt __host__ __device__
+replace_copy_if(execution_policy<Derived> &policy,
+                InputIt                    first,
+                InputIt                    last,
+                OutputIt                   result,
+                Predicate                  predicate,
+                T const &                  new_value)
+{
+  typedef typename iterator_traits<OutputIt>::value_type output_type;
+  typedef __replace::new_value_if_f<Predicate, T, output_type> new_value_if_t;
+  return cuda_cub::transform(policy,
+                             first,
+                             last,
+                             result,
+                             new_value_if_t(predicate, new_value));
+}
+
+template <class Derived,
+          class InputIt,
+          class StencilIt,
+          class OutputIt,
+          class Predicate,
+          class T>
+OutputIt __host__ __device__
+replace_copy_if(execution_policy<Derived> &policy,
+                InputIt                    first,
+                InputIt                    last,
+                StencilIt                  stencil,
+                OutputIt                   result,
+                Predicate                  predicate,
+                T const &                  new_value)
+{
+  typedef typename iterator_traits<OutputIt>::value_type output_type;
+  typedef __replace::new_value_if_f<Predicate, T, output_type> new_value_if_t;
+  return cuda_cub::transform(policy,
+                           first,
+                           last,
+                           stencil,
+                           result,
+                           new_value_if_t(predicate, new_value));
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class T>
+OutputIt __host__ __device__
+replace_copy(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             OutputIt                   result,
+             T const &                  old_value,
+             T const &                  new_value)
+{
+  return cuda_cub::replace_copy_if(policy,
+                                   first,
+                                   last,
+                                   result,
+                                   thrust::detail::equal_to_value<T>(old_value),
+                                   new_value);
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/reverse.h b/thrust/system/cuda/detail/reverse.h
index c6ae90664..7c4cb867e 100644
--- a/thrust/system/cuda/detail/reverse.h
+++ b/thrust/system/cuda/detail/reverse.h
@@ -1,22 +1,97 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived, class ItemsIt, class ResultIt>
+ResultIt __host__ __device__
+reverse_copy(execution_policy<Derived> &policy,
+             ItemsIt                    first,
+             ItemsIt                    last,
+             ResultIt                   result);
+
+template <class Derived, class ItemsIt>
+void __host__ __device__
+reverse(execution_policy<Derived> &policy,
+        ItemsIt                    first,
+        ItemsIt                    last);
+
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+
+#include <thrust/advance.h>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/swap_ranges.h>
+#include <thrust/system/cuda/detail/copy.h>
+#include <thrust/iterator/reverse_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class ItemsIt,
+          class ResultIt>
+ResultIt __host__ __device__
+reverse_copy(execution_policy<Derived> &policy,
+             ItemsIt                    first,
+             ItemsIt                    last,
+             ResultIt                   result)
+{
+  return cuda_cub::copy(policy,
+                        thrust::make_reverse_iterator(last),
+                        thrust::make_reverse_iterator(first),
+                        result);
+}
+
+template <class Derived,
+          class ItemsIt>
+void __host__ __device__
+reverse(execution_policy<Derived> &policy,
+        ItemsIt                    first,
+        ItemsIt                    last)
+{
+  typedef typename thrust::iterator_difference<ItemsIt>::type difference_type;
+
+  // find the midpoint of [first,last)
+  difference_type N = thrust::distance(first, last);
+  ItemsIt mid(first);
+  thrust::advance(mid, N / 2);
+
+  cuda_cub::swap_ranges(policy, first, mid, thrust::make_reverse_iterator(last));
+}
+
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/runtime_introspection.h b/thrust/system/cuda/detail/runtime_introspection.h
deleted file mode 100644
index 624fdad50..000000000
--- a/thrust/system/cuda/detail/runtime_introspection.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file runtime_introspection.h
- *  \brief Defines the interface to functions
- *         providing introspection into the architecture
- *         of CUDA devices.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-
-// #include this for device_properties_t and function_attributes_t
-#include <thrust/system/cuda/detail/cuda_launch_config.h>
-
-// #include this for size_t
-#include <cstddef>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-/*! Returns the current device ordinal.
- */
-inline __host__ __device__
-int current_device();
-
-
-/*! Returns a copy of the device_properties_t structure
- *  that is associated with a given device.
- */
-inline __host__ __device__
-device_properties_t device_properties(int device_id);
-
-
-/*! Returns a copy of the device_properties_t structure
- *  that is associated with the current device.
- */
-inline __host__ __device__
-device_properties_t device_properties();
-
-
-/*! Returns a copy of the function_attributes_t structure
- *  that is associated with a given __global__ function
- */
-template<typename KernelFunction>
-inline __host__ __device__
-function_attributes_t function_attributes(KernelFunction kernel);
-
-
-/*! Returns the compute capability of a device in integer format.
- *  For example, returns 10 for sm_10 and 21 for sm_21
- *  \return The compute capability as an integer
- */
-inline __host__ __device__
-size_t compute_capability(const device_properties_t &properties);
-
-
-/*! Returns the compute capability of the current device in integer format.
- *  For example, returns 10 for sm_10 and 21 for sm_21
- *  \return The compute capability as an integer
- */
-inline __host__ __device__
-size_t compute_capability();
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
-#include <thrust/system/cuda/detail/runtime_introspection.inl>
-
diff --git a/thrust/system/cuda/detail/runtime_introspection.inl b/thrust/system/cuda/detail/runtime_introspection.inl
deleted file mode 100644
index 219c81c9d..000000000
--- a/thrust/system/cuda/detail/runtime_introspection.inl
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/runtime_introspection.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <thrust/detail/util/blocking.h>
-#include <thrust/detail/minmax.h>
-#include <thrust/system/cuda/detail/throw_on_error.h>
-#include <cstdio>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace runtime_introspection_detail
-{
-
-
-__host__ __device__
-inline void uncached_device_properties(device_properties_t &p, int device_id)
-{
-#ifndef __CUDA_ARCH__
-  cudaDeviceProp properties;
-  
-  cudaError_t error = cudaGetDeviceProperties(&properties, device_id);
-  
-  throw_on_error(error, "cudaGetDeviceProperties in get_device_properties");
-
-  // be careful about how this is initialized!
-  device_properties_t temp = {
-    properties.major,
-    {
-      properties.maxGridSize[0],
-      properties.maxGridSize[1],
-      properties.maxGridSize[2]
-    },
-    properties.maxThreadsPerBlock,
-    properties.maxThreadsPerMultiProcessor,
-    properties.minor,
-    properties.multiProcessorCount,
-    properties.regsPerBlock,
-    properties.sharedMemPerBlock,
-    properties.warpSize
-  };
-
-  p = temp;
-#elif (__CUDA_ARCH__ >= 350)
-  cudaError_t error = cudaDeviceGetAttribute(&p.major,           cudaDevAttrComputeCapabilityMajor,      device_id);
-  error = cudaDeviceGetAttribute(&p.maxGridSize[0],              cudaDevAttrMaxGridDimX,                 device_id);
-  error = cudaDeviceGetAttribute(&p.maxGridSize[1],              cudaDevAttrMaxGridDimY,                 device_id);
-  error = cudaDeviceGetAttribute(&p.maxGridSize[2],              cudaDevAttrMaxGridDimZ,                 device_id);
-  error = cudaDeviceGetAttribute(&p.maxThreadsPerBlock,          cudaDevAttrMaxThreadsPerBlock,          device_id);
-  error = cudaDeviceGetAttribute(&p.maxThreadsPerMultiProcessor, cudaDevAttrMaxThreadsPerMultiProcessor, device_id);
-  error = cudaDeviceGetAttribute(&p.minor,                       cudaDevAttrComputeCapabilityMinor,      device_id);
-  error = cudaDeviceGetAttribute(&p.multiProcessorCount,         cudaDevAttrMultiProcessorCount,         device_id);
-  error = cudaDeviceGetAttribute(&p.regsPerBlock,                cudaDevAttrMaxRegistersPerBlock,        device_id);
-  int temp;
-  error = cudaDeviceGetAttribute(&temp,                          cudaDevAttrMaxSharedMemoryPerBlock,     device_id);
-  p.sharedMemPerBlock = temp;
-  error = cudaDeviceGetAttribute(&p.warpSize,                    cudaDevAttrWarpSize,                    device_id);
-
-  throw_on_error(error, "cudaDeviceGetProperty in get_device_properties");
-#else
-  // dunno how we can safely error here.
-#endif
-} // end get_device_properties()
-
-
-inline void cached_device_properties(device_properties_t &p, int device_id)
-{
-  // cache the result of get_device_properties, because it is slow
-  // only cache the first few devices
-  static const int max_num_devices                              = 16;
-
-  static bool properties_exist[max_num_devices]                 = {0};
-  static device_properties_t device_properties[max_num_devices] = {};
-
-  if(device_id >= max_num_devices)
-  {
-    uncached_device_properties(p, device_id);
-  }
-
-  if(!properties_exist[device_id])
-  {
-    uncached_device_properties(device_properties[device_id], device_id);
-
-    // disallow the compiler to move the write to properties_exist[device_id]
-    // before the initialization of device_properties[device_id]
-    __thrust_compiler_fence();
-    
-    properties_exist[device_id] = true;
-  }
-
-  p = device_properties[device_id];
-}
-
-
-} // end runtime_introspection_detail
-
-
-inline __host__ __device__
-device_properties_t device_properties(int device_id)
-{
-  device_properties_t result;
-#ifndef __CUDA_ARCH__
-  runtime_introspection_detail::cached_device_properties(result, device_id);
-#else
-  runtime_introspection_detail::uncached_device_properties(result, device_id);
-#endif
-  return result;
-}
-
-
-inline __host__ __device__
-int current_device()
-{
-  int result = -1;
-
-#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 350
-  cudaError_t error = cudaGetDevice(&result);
-
-  throw_on_error(error, "cudaGetDevice in current_device");
-
-  if(result < 0)
-  {
-    throw_on_error(cudaErrorNoDevice, "cudaGetDevice in current_device");
-  }
-#else
-  // dunno how to safely error here
-#endif
-
-  return result;
-}
-
-
-inline __host__ __device__
-device_properties_t device_properties()
-{
-  return device_properties(current_device());
-}
-
-
-template<typename KernelFunction>
-__host__ __device__
-inline function_attributes_t function_attributes(KernelFunction kernel)
-{
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
-  cudaFuncAttributes attributes;
-
-  typedef void (*fun_ptr_type)();
-
-  fun_ptr_type fun_ptr = reinterpret_cast<fun_ptr_type>(kernel);
-  throw_on_error(cudaFuncGetAttributes(&attributes, reinterpret_cast<void*>(fun_ptr)), "cudaFuncGetAttributes in function_attributes");
-
-  // be careful about how this is initialized!
-  function_attributes_t result = {
-    attributes.constSizeBytes,
-    attributes.localSizeBytes,
-    attributes.maxThreadsPerBlock,
-    attributes.numRegs,
-    attributes.ptxVersion,
-    attributes.sharedSizeBytes
-  };
-#else
-  function_attributes_t result = {0};
-#endif
-
-  return result;
-}
-
-
-inline __host__ __device__
-size_t compute_capability(const device_properties_t &properties)
-{
-  return 10 * properties.major + properties.minor;
-}
-
-
-inline __host__ __device__
-size_t compute_capability(void)
-{
-  return compute_capability(device_properties());
-}
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/scan.h b/thrust/system/cuda/detail/scan.h
index 560f553ef..fdab8df84 100644
--- a/thrust/system/cuda/detail/scan.h
+++ b/thrust/system/cuda/detail/scan.h
@@ -1,69 +1,356 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
+#pragma once
 
+#include <thrust/detail/config.h>
 
-/*! \file scan.h
- *  \brief Scan operations (parallel prefix-sum) [cuda]
- */
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
 
-#pragma once
+#include <thrust/detail/config/exec_check_disable.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
+#include <cub/device/device_scan.cuh>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
 {
-namespace system
+namespace detail
 {
-namespace cuda
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan_n_impl(thrust::cuda_cub::execution_policy<Derived> &policy,
+                               InputIt first,
+                               Size num_items,
+                               OutputIt result,
+                               ScanOp scan_op)
 {
-namespace detail
+  using AccumT = typename thrust::iterator_traits<InputIt>::value_type;
+  using Dispatch32 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       cub::NullType,
+                                       thrust::detail::int32_t,
+                                       AccumT>;
+  using Dispatch64 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       cub::NullType,
+                                       thrust::detail::int64_t,
+                                       AccumT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary storage requirements:
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan");
+  }
+
+  // Run scan:
+  {
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching inclusive_scan kernel");
+    thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy),
+                                     "inclusive_scan failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename InitValueT,
+          typename ScanOp>
+__host__ __device__
+OutputIt exclusive_scan_n_impl(thrust::cuda_cub::execution_policy<Derived> &policy,
+                               InputIt first,
+                               Size num_items,
+                               OutputIt result,
+                               InitValueT init,
+                               ScanOp scan_op)
+{
+  using InputValueT = cub::detail::InputValue<InitValueT>;
+  using Dispatch32 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       InputValueT,
+                                       thrust::detail::int32_t,
+                                       InitValueT>;
+  using Dispatch64 = cub::DispatchScan<InputIt,
+                                       OutputIt,
+                                       ScanOp,
+                                       InputValueT,
+                                       thrust::detail::int64_t,
+                                       InitValueT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status;
+
+  // Determine temporary storage requirements:
+  size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 InputValueT(init),
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan");
+  }
+
+  // Run scan:
+  {
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 first,
+                                 result,
+                                 scan_op,
+                                 InputValueT(init),
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after dispatching exclusive_scan kernel");
+    thrust::cuda_cub::throw_on_error(thrust::cuda_cub::synchronize_optional(policy),
+                                     "exclusive_scan failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+} // namespace detail
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan_n(thrust::cuda_cub::execution_policy<Derived> &policy,
+                          InputIt first,
+                          Size num_items,
+                          OutputIt result,
+                          ScanOp scan_op)
+{
+  THRUST_CDP_DISPATCH(
+    (result = thrust::cuda_cub::detail::inclusive_scan_n_impl(policy,
+                                                              first,
+                                                              num_items,
+                                                              result,
+                                                              scan_op);),
+    (result = thrust::inclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                     first,
+                                     first + num_items,
+                                     result,
+                                     scan_op);));
+  return result;
+}
+
+template <typename Derived, typename InputIt, typename OutputIt, typename ScanOp>
+__host__ __device__
+OutputIt inclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        ScanOp scan_op)
+{
+  using diff_t = typename thrust::iterator_traits<InputIt>::difference_type;
+  diff_t const num_items = thrust::distance(first, last);
+  return thrust::cuda_cub::inclusive_scan_n(policy,
+                                            first,
+                                            num_items,
+                                            result,
+                                            scan_op);
+}
+
+template <typename Derived, typename InputIt, typename OutputIt>
+__host__ __device__
+OutputIt inclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result)
 {
+  return thrust::cuda_cub::inclusive_scan(policy,
+                                          first,
+                                          last,
+                                          result,
+                                          thrust::plus<>{});
+}
 
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename InputIt,
+          typename Size,
+          typename OutputIt,
+          typename T,
+          typename ScanOp>
+__host__ __device__
+OutputIt exclusive_scan_n(thrust::cuda_cub::execution_policy<Derived> &policy,
+                          InputIt first,
+                          Size num_items,
+                          OutputIt result,
+                          T init,
+                          ScanOp scan_op)
+{
+  THRUST_CDP_DISPATCH(
+    (result = thrust::cuda_cub::detail::exclusive_scan_n_impl(policy,
+                                                              first,
+                                                              num_items,
+                                                              result,
+                                                              init,
+                                                              scan_op);),
+    (result = thrust::exclusive_scan(cvt_to_seq(derived_cast(policy)),
+                                     first,
+                                     first + num_items,
+                                     result,
+                                     init,
+                                     scan_op);));
+  return result;
+}
 
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename AssociativeOperator>
+template <typename Derived,
+          typename InputIt,
+          typename OutputIt,
+          typename T,
+          typename ScanOp>
 __host__ __device__
-OutputIterator inclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              AssociativeOperator binary_op);
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename T,
-         typename AssociativeOperator>
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        T init,
+                        ScanOp scan_op)
+{
+  using diff_t = typename thrust::iterator_traits<InputIt>::difference_type;
+  diff_t const num_items = thrust::distance(first, last);
+  return thrust::cuda_cub::exclusive_scan_n(policy,
+                                            first,
+                                            num_items,
+                                            result,
+                                            init,
+                                            scan_op);
+}
+
+template <typename Derived, typename InputIt, typename OutputIt, typename T>
 __host__ __device__
-OutputIterator exclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              T init,
-                              AssociativeOperator binary_op);
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result,
+                        T init)
+{
+  return thrust::cuda_cub::exclusive_scan(policy,
+                                          first,
+                                          last,
+                                          result,
+                                          init,
+                                          thrust::plus<>{});
+}
 
+template <typename Derived, typename InputIt, typename OutputIt>
+__host__ __device__
+OutputIt exclusive_scan(thrust::cuda_cub::execution_policy<Derived> &policy,
+                        InputIt first,
+                        InputIt last,
+                        OutputIt result)
+{
+  using init_type = typename thrust::iterator_traits<InputIt>::value_type;
+  return cuda_cub::exclusive_scan(policy, first, last, result, init_type{});
+};
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
 
-#include <thrust/system/cuda/detail/scan.inl>
+#include <thrust/scan.h>
 
+#endif // NVCC
diff --git a/thrust/system/cuda/detail/scan.inl b/thrust/system/cuda/detail/scan.inl
deleted file mode 100644
index 4bcb09693..000000000
--- a/thrust/system/cuda/detail/scan.inl
+++ /dev/null
@@ -1,485 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file scan.inl
- *  \brief Inline file for scan.h.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/scan.h>
-#include <thrust/detail/seq.h>
-#include <thrust/detail/temporary_array.h>
-#include <thrust/detail/type_traits/function_traits.h>
-#include <thrust/system/cuda/detail/decomposition.h>
-#include <thrust/system/cuda/detail/bulk.h>
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace scan_detail
-{
-
-
-struct inclusive_scan_n
-{
-  template<typename ConcurrentGroup, typename InputIterator, typename Size, typename OutputIterator, typename T, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op)
-  {
-    bulk_::inclusive_scan(this_group, first, first + n, result, init, binary_op);
-  }
-
-
-  template<typename ConcurrentGroup, typename InputIterator, typename Size, typename OutputIterator, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, BinaryFunction binary_op)
-  {
-    bulk_::inclusive_scan(this_group, first, first + n, result, binary_op);
-  }
-};
-
-
-struct exclusive_scan_n
-{
-  template<typename ConcurrentGroup, typename InputIterator, typename Size, typename OutputIterator, typename T, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group, InputIterator first, Size n, OutputIterator result, T init, BinaryFunction binary_op)
-  {
-    bulk_::exclusive_scan(this_group, first, first + n, result, init, binary_op);
-  }
-};
-
-
-struct inclusive_downsweep
-{
-  template<typename ConcurrentGroup, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename RandomAccessIterator3, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group,
-                             RandomAccessIterator1 first,
-                             Decomposition decomp,
-                             RandomAccessIterator2 carries_first,
-                             RandomAccessIterator3 result,
-                             BinaryFunction binary_op)
-  {
-    typename Decomposition::range range = decomp[this_group.index()];
-  
-    RandomAccessIterator1 last = first + range.second;
-    first += range.first;
-    result += range.first;
-  
-    if(this_group.index() == 0)
-    {
-      bulk_::inclusive_scan(this_group, first, last, result, binary_op);
-    }
-    else
-    {
-      typename thrust::iterator_value<RandomAccessIterator2>::type carry = carries_first[this_group.index() - 1];
-
-      bulk_::inclusive_scan(this_group, first, last, result, carry, binary_op);
-    }
-  }
-};
-
-
-struct exclusive_downsweep
-{
-  template<typename ConcurrentGroup, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename RandomAccessIterator3, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group,
-                             RandomAccessIterator1 first,
-                             Decomposition decomp,
-                             RandomAccessIterator2 carries_first,
-                             RandomAccessIterator3 result,
-                             BinaryFunction binary_op)
-  {
-    typename Decomposition::range range = decomp[this_group.index()];
-  
-    RandomAccessIterator1 last = first + range.second;
-    first += range.first;
-    result += range.first;
-  
-    typename thrust::iterator_value<RandomAccessIterator2>::type carry = carries_first[this_group.index()];
-
-    bulk_::exclusive_scan(this_group, first, last, result, carry, binary_op);
-  }
-};
-
-
-template<typename T> struct accumulate_tiles_tuning_impl;
-
-
-template<> struct accumulate_tiles_tuning_impl<int>
-{
-  // determined from empirical testing on k20c & nvcc 6.5 RC
-  static const int groupsize = 128;
-  static const int grainsize = 9;
-};
-
-
-template<> struct accumulate_tiles_tuning_impl<double>
-{
-  // determined from empirical testing on k20c & nvcc 6.5 RC
-  static const int groupsize = 128;
-  static const int grainsize = 9;
-};
-
-
-// determined from empirical testing on k20c
-template<typename T>
-  struct accumulate_tiles_tuning
-{
-  static const int groupsize =
-    sizeof(T) <=     sizeof(int) ? accumulate_tiles_tuning_impl<int>::groupsize :
-    sizeof(T) <= 2 * sizeof(int) ? accumulate_tiles_tuning_impl<double>::groupsize :
-    128;
-  
-  static const int grainsize =
-    sizeof(T) <=     sizeof(int) ? accumulate_tiles_tuning_impl<int>::grainsize :
-    sizeof(T) <= 2 * sizeof(int) ? accumulate_tiles_tuning_impl<double>::grainsize :
-    3;
-};
-
-// this specialization accomodates scan_by_key,
-// whose intermediate type is a tuple
-template<typename T1, typename T2>
-  struct accumulate_tiles_tuning<thrust::tuple<T1,T2> >
-{
-  // determined from empirical testing on k20c
-  static const int groupsize = 128;
-  static const int grainsize = ((sizeof(T1) + sizeof(T2)) <= (2 * sizeof(double))) ? 5 : 3;
-};
-
-
-
-
-
-struct accumulate_tiles
-{
-  template<typename ConcurrentGroup, typename RandomAccessIterator1, typename Decomposition, typename RandomAccessIterator2, typename BinaryFunction>
-  __device__ void operator()(ConcurrentGroup &this_group,
-                             RandomAccessIterator1 first,
-                             Decomposition decomp,
-                             RandomAccessIterator2 result,
-                             BinaryFunction binary_op)
-  {
-    typedef typename thrust::iterator_value<RandomAccessIterator1>::type value_type;
-    
-    typename Decomposition::range range = decomp[this_group.index()];
-
-    const bool commutative = thrust::detail::is_commutative<BinaryFunction>::value;
-
-    // for a commutative accumulate, it's much faster to pass the last value as the init for some reason
-    value_type init = commutative ? first[range.second-1] : first[range.first];
-
-    value_type sum = commutative ?
-      bulk_::accumulate(this_group, first + range.first, first + range.second - 1, init, binary_op) :
-      bulk_::accumulate(this_group, first + range.first + 1, first + range.second, init, binary_op);
-
-    if(this_group.this_exec.index() == 0)
-    {
-      result[this_group.index()] = sum;
-    } // end if
-  } // end operator()
-}; // end accumulate_tiles
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename AssociativeOperator>
-__host__ __device__
-OutputIterator inclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              AssociativeOperator binary_op)
-{
-  typedef typename bulk_::detail::scan_detail::scan_intermediate<
-    InputIterator,
-    OutputIterator,
-    AssociativeOperator
-  >::type intermediate_type;
-
-  typedef typename thrust::iterator_difference<InputIterator>::type Size;
-
-  Size n = last - first;
-
-  cudaStream_t s = stream(thrust::detail::derived_cast(exec));
-  
-  const Size threshold_of_parallelism = 20000;
-
-  if(n < threshold_of_parallelism)
-  {
-    const Size groupsize =
-      sizeof(intermediate_type) <= 2 * sizeof(int) ? 512 :
-      sizeof(intermediate_type) <= 4 * sizeof(int) ? 256 :
-      128;
-
-    typedef bulk_::detail::scan_detail::scan_buffer<groupsize,3,InputIterator,OutputIterator,AssociativeOperator> heap_type;
-    Size heap_size = sizeof(heap_type);
-    bulk_::async(bulk_::grid<groupsize,3>(1, heap_size, s), scan_detail::inclusive_scan_n(), bulk_::root.this_exec, first, n, result, binary_op);
-
-    // XXX WAR unused variable warning
-    (void) groupsize;
-  } // end if
-  else
-  {
-    const Size groupsize = scan_detail::accumulate_tiles_tuning<intermediate_type>::groupsize;
-    const Size grainsize = scan_detail::accumulate_tiles_tuning<intermediate_type>::grainsize;
-
-    const Size tile_size = groupsize * grainsize;
-    Size num_tiles = (n + tile_size - 1) / tile_size;
-
-    // 20 determined from empirical testing on k20c & GTX 480
-    Size subscription = 20;
-    Size num_groups = thrust::min<Size>(subscription * bulk_::concurrent_group<>::hardware_concurrency(), num_tiles);
-
-    aligned_decomposition<Size> decomp(n, num_groups, tile_size);
-
-    thrust::detail::temporary_array<intermediate_type,DerivedPolicy> carries(exec, num_groups);
-    	
-    // Run the parallel raking reduce as an upsweep.
-    // n loads + num_groups stores
-    Size heap_size = groupsize * sizeof(intermediate_type);
-    bulk_::async(bulk_::grid<groupsize,grainsize>(num_groups,heap_size,s), scan_detail::accumulate_tiles(), bulk_::root.this_exec, first, decomp, carries.begin(), binary_op);
-
-    // scan the sums to get the carries
-    // num_groups loads + num_groups stores
-    const Size groupsize2 = sizeof(intermediate_type) <= 2 * sizeof(int) ? 256 : 128;
-    const Size grainsize2 = 3;
-    typedef bulk_::detail::scan_detail::scan_buffer<groupsize2,grainsize2,InputIterator,OutputIterator,AssociativeOperator> heap_type2;
-    heap_size = sizeof(heap_type2);
-    bulk_::async(bulk_::grid<groupsize2,grainsize2>(1,heap_size,s), scan_detail::inclusive_scan_n(), bulk_::root.this_exec, carries.begin(), num_groups, carries.begin(), binary_op);
-
-    // do the downsweep - n loads, n stores
-    typedef bulk_::detail::scan_detail::scan_buffer<
-      groupsize,
-      grainsize,
-      InputIterator,OutputIterator,AssociativeOperator
-    > heap_type3;
-    heap_size = sizeof(heap_type3);
-    bulk_::async(bulk_::grid<groupsize,grainsize>(num_groups,heap_size,s), scan_detail::inclusive_downsweep(), bulk_::root.this_exec, first, decomp, carries.begin(), result, binary_op);
-
-    // XXX WAR unused variable warnings
-    (void) groupsize2;
-    (void) grainsize2;
-  } // end else
-
-  return result + n;
-} // end inclusive_scan()
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename T,
-         typename AssociativeOperator>
-__host__ __device__
-OutputIterator exclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              T init,
-                              AssociativeOperator binary_op)
-{
-  typedef typename bulk_::detail::scan_detail::scan_intermediate<
-    InputIterator,
-    OutputIterator,
-    AssociativeOperator
-  >::type intermediate_type;
-
-  typedef typename thrust::iterator_difference<InputIterator>::type Size;
-
-  Size n = last - first;
-
-  cudaStream_t s = stream(thrust::detail::derived_cast(exec));
-  
-  const Size threshold_of_parallelism = 20000;
-
-  if(n < threshold_of_parallelism)
-  {
-    const Size groupsize =
-      sizeof(intermediate_type) <= 2 * sizeof(int) ? 512 :
-      sizeof(intermediate_type) <= 4 * sizeof(int) ? 256 :
-      128;
-
-    typedef bulk_::detail::scan_detail::scan_buffer<groupsize,3,InputIterator,OutputIterator,AssociativeOperator> heap_type;
-    Size heap_size = sizeof(heap_type);
-    bulk_::async(bulk_::grid<groupsize,3>(1, heap_size, s), scan_detail::exclusive_scan_n(), bulk_::root.this_exec, first, n, result, init, binary_op);
-
-    // XXX WAR unused variable warning
-    (void) groupsize;
-  } // end if
-  else
-  {
-    const Size groupsize = scan_detail::accumulate_tiles_tuning<intermediate_type>::groupsize;
-    const Size grainsize = scan_detail::accumulate_tiles_tuning<intermediate_type>::grainsize;
-
-    const Size tile_size = groupsize * grainsize;
-    Size num_tiles = (n + tile_size - 1) / tile_size;
-
-    // 20 determined from empirical testing on k20c & GTX 480
-    Size subscription = 20;
-    Size num_groups = thrust::min<Size>(subscription * bulk_::concurrent_group<>::hardware_concurrency(), num_tiles);
-
-    aligned_decomposition<Size> decomp(n, num_groups, tile_size);
-
-    thrust::detail::temporary_array<intermediate_type,DerivedPolicy> carries(exec, num_groups);
-    	
-    // Run the parallel raking reduce as an upsweep.
-    // n loads + num_groups stores
-    Size heap_size = groupsize * sizeof(intermediate_type);
-    bulk_::async(bulk_::grid<groupsize,grainsize>(num_groups,heap_size,s), scan_detail::accumulate_tiles(), bulk_::root.this_exec, first, decomp, carries.begin(), binary_op);
-    
-    // scan the sums to get the carries
-    // num_groups loads + num_groups stores
-    const Size groupsize2 = sizeof(intermediate_type) <= 2 * sizeof(int) ? 256 : 128;
-    const Size grainsize2 = 3;
-
-    typedef bulk_::detail::scan_detail::scan_buffer<groupsize2,grainsize2,InputIterator,OutputIterator,AssociativeOperator> heap_type2;
-    heap_size = sizeof(heap_type2);
-    bulk_::async(bulk_::grid<groupsize2,grainsize2>(1,heap_size,s), scan_detail::exclusive_scan_n(), bulk_::root.this_exec, carries.begin(), num_groups, carries.begin(), init, binary_op);
-
-    // do the downsweep - n loads, n stores
-    typedef bulk_::detail::scan_detail::scan_buffer<
-      groupsize,
-      grainsize,
-      InputIterator,OutputIterator,AssociativeOperator
-    > heap_type3;
-    heap_size = sizeof(heap_type3);
-    bulk_::async(bulk_::grid<groupsize,grainsize>(num_groups,heap_size,s), scan_detail::exclusive_downsweep(), bulk_::root.this_exec, first, decomp, carries.begin(), result, binary_op);
-
-    // XXX WAR unused variable warnings
-    (void) groupsize2;
-    (void) grainsize2;
-  } // end else
-
-  return result + n;
-} // end exclusive_scan()
-
-
-} // end scan_detail
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename AssociativeOperator>
-__host__ __device__
-OutputIterator inclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              AssociativeOperator binary_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static OutputIterator parallel_path(execution_policy<DerivedPolicy> &exec,
-                                        InputIterator first,
-                                        InputIterator last,
-                                        OutputIterator result,
-                                        AssociativeOperator binary_op)
-    {
-      return thrust::system::cuda::detail::scan_detail::inclusive_scan(exec, first, last, result, binary_op);
-    }
-
-    __host__ __device__
-    static OutputIterator sequential_path(execution_policy<DerivedPolicy> &,
-                                          InputIterator first,
-                                          InputIterator last,
-                                          OutputIterator result,
-                                          AssociativeOperator binary_op)
-    {
-      return thrust::inclusive_scan(thrust::seq, first, last, result, binary_op);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, last, result, binary_op);
-#else
-  return workaround::sequential_path(exec, first, last, result, binary_op);
-#endif
-} // end inclusive_scan()
-
-
-template<typename DerivedPolicy,
-         typename InputIterator,
-         typename OutputIterator,
-         typename T,
-         typename AssociativeOperator>
-__host__ __device__
-OutputIterator exclusive_scan(execution_policy<DerivedPolicy> &exec,
-                              InputIterator first,
-                              InputIterator last,
-                              OutputIterator result,
-                              T init,
-                              AssociativeOperator binary_op)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static OutputIterator parallel_path(execution_policy<DerivedPolicy> &exec,
-                                        InputIterator first,
-                                        InputIterator last,
-                                        OutputIterator result,
-                                        T init,
-                                        AssociativeOperator binary_op)
-    {
-      return thrust::system::cuda::detail::scan_detail::exclusive_scan(exec, first, last, result, init, binary_op);
-    }
-
-    __host__ __device__
-    static OutputIterator sequential_path(execution_policy<DerivedPolicy> &,
-                                          InputIterator first,
-                                          InputIterator last,
-                                          OutputIterator result,
-                                          T init,
-                                          AssociativeOperator binary_op)
-    {
-      return thrust::exclusive_scan(thrust::seq, first, last, result, init, binary_op);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first, last, result, init, binary_op);
-#else
-  return workaround::sequential_path(exec, first, last, result, init, binary_op);
-#endif
-} // end exclusive_scan()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/scan_by_key.h b/thrust/system/cuda/detail/scan_by_key.h
index c6ae90664..0407779c6 100644
--- a/thrust/system/cuda/detail/scan_by_key.h
+++ b/thrust/system/cuda/detail/scan_by_key.h
@@ -1,22 +1,492 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <thrust/iterator/iterator_traits.h>
+
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/temporary_array.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/dispatch.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/device/dispatch/dispatch_scan_by_key.cuh>
+#include <cub/util_type.cuh>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
+{
+namespace detail
+{
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename KeysInIt,
+          typename ValuesInIt,
+          typename ValuesOutIt,
+          typename EqualityOpT,
+          typename ScanOpT,
+          typename SizeT>
+__host__ __device__
+ValuesOutIt inclusive_scan_by_key_n(
+  thrust::cuda_cub::execution_policy<Derived>& policy,
+  KeysInIt keys,
+  ValuesInIt values,
+  ValuesOutIt result,
+  SizeT num_items,
+  EqualityOpT equality_op,
+  ScanOpT scan_op)
+{
+  if (num_items == 0)
+  {
+    return result;
+  }
+
+  // Convert to raw pointers if possible:
+  using KeysInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<KeysInIt>;
+  using ValuesInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesInIt>;
+  using ValuesOutUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesOutIt>;
+  using AccumT = typename thrust::iterator_traits<ValuesInUnwrapIt>::value_type;
+
+  auto keys_unwrap = thrust::detail::try_unwrap_contiguous_iterator(keys);
+  auto values_unwrap = thrust::detail::try_unwrap_contiguous_iterator(values);
+  auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+  using Dispatch32 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            cub::NullType,
+                                            thrust::detail::int32_t,
+                                            AccumT>;
+  using Dispatch64 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            cub::NullType,
+                                            thrust::detail::int64_t,
+                                            AccumT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status{};
+
+  // Determine temporary storage requirements:
+  std::size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for inclusive_scan_by_key");
+  }
+
+  // Run scan:
+  {
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 cub::NullType{},
+                                 num_items_fixed,
+                                 stream));
+
+    thrust::cuda_cub::throw_on_error(
+      status, "after dispatching inclusive_scan_by_key kernel");
+
+    thrust::cuda_cub::throw_on_error(
+      thrust::cuda_cub::synchronize_optional(policy),
+      "inclusive_scan_by_key failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+__thrust_exec_check_disable__
+template <typename Derived,
+          typename KeysInIt,
+          typename ValuesInIt,
+          typename ValuesOutIt,
+          typename InitValueT,
+          typename EqualityOpT,
+          typename ScanOpT,
+          typename SizeT>
+__host__ __device__
+ValuesOutIt exclusive_scan_by_key_n(
+  thrust::cuda_cub::execution_policy<Derived>& policy,
+  KeysInIt keys,
+  ValuesInIt values,
+  ValuesOutIt result,
+  SizeT num_items,
+  InitValueT init_value,
+  EqualityOpT equality_op,
+  ScanOpT scan_op)
+{
+
+  if (num_items == 0)
+  {
+    return result;
+  }
+
+  // Convert to raw pointers if possible:
+  using KeysInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<KeysInIt>;
+  using ValuesInUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesInIt>;
+  using ValuesOutUnwrapIt =
+    thrust::detail::try_unwrap_contiguous_iterator_return_t<ValuesOutIt>;
+
+  auto keys_unwrap = thrust::detail::try_unwrap_contiguous_iterator(keys);
+  auto values_unwrap = thrust::detail::try_unwrap_contiguous_iterator(values);
+  auto result_unwrap = thrust::detail::try_unwrap_contiguous_iterator(result);
+
+  using Dispatch32 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            InitValueT,
+                                            thrust::detail::int32_t,
+                                            InitValueT>;
+  using Dispatch64 = cub::DispatchScanByKey<KeysInUnwrapIt,
+                                            ValuesInUnwrapIt,
+                                            ValuesOutUnwrapIt,
+                                            EqualityOpT,
+                                            ScanOpT,
+                                            InitValueT,
+                                            thrust::detail::int64_t,
+                                            InitValueT>;
+
+  cudaStream_t stream = thrust::cuda_cub::stream(policy);
+  cudaError_t status{};
+
+  // Determine temporary storage requirements:
+  std::size_t tmp_size = 0;
+  {
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (nullptr,
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 init_value,
+                                 num_items_fixed,
+                                 stream));
+    thrust::cuda_cub::throw_on_error(status,
+                                     "after determining tmp storage "
+                                     "requirements for exclusive_scan_by_key");
+  }
+
+  // Run scan:
+  {
+    // Allocate temporary storage:
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived> tmp{
+      policy,
+      tmp_size};
+
+    THRUST_INDEX_TYPE_DISPATCH2(status,
+                                Dispatch32::Dispatch,
+                                Dispatch64::Dispatch,
+                                num_items,
+                                (tmp.data().get(),
+                                 tmp_size,
+                                 keys_unwrap,
+                                 values_unwrap,
+                                 result_unwrap,
+                                 equality_op,
+                                 scan_op,
+                                 init_value,
+                                 num_items_fixed,
+                                 stream));
+
+    thrust::cuda_cub::throw_on_error(
+      status, "after dispatching exclusive_scan_by_key kernel");
+
+    thrust::cuda_cub::throw_on_error(
+      thrust::cuda_cub::synchronize_optional(policy),
+      "exclusive_scan_by_key failed to synchronize");
+  }
+
+  return result + num_items;
+}
+
+
+} // namespace detail
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+//---------------------------
+//   Inclusive scan
+//---------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class BinaryPred,
+          class ScanOp>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      BinaryPred                 binary_pred,
+                      ScanOp                     scan_op)
+{
+  ValOutputIt ret = value_result;
+  THRUST_CDP_DISPATCH(
+    (ret = thrust::cuda_cub::detail::inclusive_scan_by_key_n(
+       policy,
+       key_first,
+       value_first,
+       value_result,
+       thrust::distance(key_first, key_last),
+       binary_pred,
+       scan_op);),
+    (ret = thrust::inclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         binary_pred,
+                                         scan_op);));
+
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class BinaryPred>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      BinaryPred                 binary_pred)
+{
+  return cuda_cub::inclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         binary_pred,
+                                         thrust::plus<>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt>
+ValOutputIt __host__ __device__
+inclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result)
+{
+  return cuda_cub::inclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         thrust::equal_to<>());
+}
+
+
+//---------------------------
+//   Exclusive scan
+//---------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init,
+          class BinaryPred,
+          class ScanOp>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init,
+                      BinaryPred                 binary_pred,
+                      ScanOp                     scan_op)
+{
+  ValOutputIt ret = value_result;
+  THRUST_CDP_DISPATCH(
+    (ret = thrust::cuda_cub::detail::exclusive_scan_by_key_n(
+       policy,
+       key_first,
+       value_first,
+       value_result,
+       thrust::distance(key_first, key_last),
+       init,
+       binary_pred,
+       scan_op);),
+    (ret = thrust::exclusive_scan_by_key(cvt_to_seq(derived_cast(policy)),
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         binary_pred,
+                                         scan_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init,
+          class BinaryPred>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init,
+                      BinaryPred                 binary_pred)
+{
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         binary_pred,
+                                         thrust::plus<>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt,
+          class Init>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result,
+                      Init                       init)
+{
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         init,
+                                         thrust::equal_to<>());
+}
+
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class ValOutputIt>
+ValOutputIt __host__ __device__
+exclusive_scan_by_key(execution_policy<Derived> &policy,
+                      KeyInputIt                 key_first,
+                      KeyInputIt                 key_last,
+                      ValInputIt                 value_first,
+                      ValOutputIt                value_result)
+{
+  using value_type = typename thrust::iterator_traits<ValInputIt>::value_type;
+  return cuda_cub::exclusive_scan_by_key(policy,
+                                         key_first,
+                                         key_last,
+                                         value_first,
+                                         value_result,
+                                         value_type{});
+}
+
+
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+
+#include <thrust/scan.h>
 
+#endif // NVCC
diff --git a/thrust/system/cuda/detail/scatter.h b/thrust/system/cuda/detail/scatter.h
index c6ae90664..e297d782d 100644
--- a/thrust/system/cuda/detail/scatter.h
+++ b/thrust/system/cuda/detail/scatter.h
@@ -1,22 +1,106 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/iterator/permutation_iterator.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class ResultIt>
+void __host__ __device__
+scatter(execution_policy<Derived>& policy,
+        ItemsIt                    first,
+        ItemsIt                    last,
+        MapIt                      map,
+        ResultIt                   result)
+{
+  cuda_cub::transform(policy,
+                   first,
+                   last,
+                   thrust::make_permutation_iterator(result, map),
+                   identity());
+}
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class StencilIt,
+          class ResultIt,
+          class Predicate>
+void __host__ __device__
+scatter_if(execution_policy<Derived>& policy,
+           ItemsIt                    first,
+           ItemsIt                    last,
+           MapIt                      map,
+           StencilIt                  stencil,
+           ResultIt                   result,
+           Predicate                  predicate)
+{
+  cuda_cub::transform_if(policy,
+                      first,
+                      last,
+                      stencil,
+                      thrust::make_permutation_iterator(result, map),
+                      identity(),
+                      predicate);
+}
+
+template <class Derived,
+          class ItemsIt,
+          class MapIt,
+          class StencilIt,
+          class ResultIt,
+          class Predicate>
+void __host__ __device__
+scatter_if(execution_policy<Derived>& policy,
+           ItemsIt                    first,
+           ItemsIt                    last,
+           MapIt                      map,
+           StencilIt                  stencil,
+           ResultIt                   result)
+{
+  cuda_cub::scatter_if(policy,
+                    first,
+                    last,
+                    map,
+                    stencil,
+                    result,
+                    identity());
+}
+
 
+} // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/set_difference.inl b/thrust/system/cuda/detail/set_difference.inl
deleted file mode 100644
index fc1c4357f..000000000
--- a/thrust/system/cuda/detail/set_difference.inl
+++ /dev/null
@@ -1,173 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/set_operations.h>
-#include <thrust/detail/cstdint.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/seq.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace set_difference_detail
-{
-
-
-struct serial_bounded_set_difference
-{
-  // max_input_size <= 32
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
-  inline __device__
-    thrust::detail::uint32_t operator()(Size max_input_size,
-                                        InputIterator1 first1, InputIterator1 last1,
-                                        InputIterator2 first2, InputIterator2 last2,
-                                        OutputIterator result,
-                                        Compare comp)
-  {
-    thrust::detail::uint32_t active_mask = 0;
-    thrust::detail::uint32_t active_bit = 1;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        *result = *first1;
-        active_mask |= active_bit;
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-      } // end else if
-      else
-      {
-        ++first1;
-        ++first2;
-      } // end else
-  
-      ++result;
-      active_bit <<= 1;
-    } // end while
-
-    while(first1 != last1)
-    {
-      *result = *first1;
-      ++first1;
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    }
-  
-    return active_mask;
-  }
-
-
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
-  inline __device__
-    Size count(Size max_input_size,
-               InputIterator1 first1, InputIterator1 last1,
-               InputIterator2 first2, InputIterator2 last2,
-               Compare comp)
-  {
-    Size result = 0;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        ++first1;
-        ++result;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-      } // end else if
-      else
-      {
-        ++first1;
-        ++first2;
-      } // end else
-    } // end while
-  
-    return result + last1 - first1;
-  }
-}; // end serial_bounded_set_difference
-
-
-} // end namespace set_difference_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_difference(execution_policy<DerivedPolicy> &exec,
-                                     RandomAccessIterator1 first1,
-                                     RandomAccessIterator1 last1,
-                                     RandomAccessIterator2 first2,
-                                     RandomAccessIterator2 last2,
-                                     RandomAccessIterator3 result,
-                                     Compare comp)
-{
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator3 parallel_path(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-    {
-      return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_difference_detail::serial_bounded_set_difference());
-    }
-
-    __host__ __device__
-    static RandomAccessIterator3 sequential_path(execution_policy<DerivedPolicy> &,
-                                                 RandomAccessIterator1 first1,
-                                                 RandomAccessIterator1 last1,
-                                                 RandomAccessIterator2 first2,
-                                                 RandomAccessIterator2 last2,
-                                                 RandomAccessIterator3 result,
-                                                 Compare comp)
-    {
-      return thrust::set_difference(thrust::seq, first1, last1, first2, last2, result, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first1, last1, first2, last2, result, comp);
-#else
-  return workaround::sequential_path(exec, first1, last1, first2, last2, result, comp);
-#endif
-} // end set_difference
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/set_intersection.inl b/thrust/system/cuda/detail/set_intersection.inl
deleted file mode 100644
index 466b58376..000000000
--- a/thrust/system/cuda/detail/set_intersection.inl
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/set_operations.h>
-#include <thrust/detail/cstdint.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/seq.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace set_intersection_detail
-{
-
-
-struct serial_bounded_set_intersection
-{
-  // max_input_size <= 32
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
-  inline __device__
-    thrust::detail::uint32_t operator()(Size max_input_size,
-                                        InputIterator1 first1, InputIterator1 last1,
-                                        InputIterator2 first2, InputIterator2 last2,
-                                        OutputIterator result,
-                                        Compare comp)
-  {
-    thrust::detail::uint32_t active_mask = 0;
-    thrust::detail::uint32_t active_bit = 1;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-      } // end else if
-      else
-      {
-        *result = *first1;
-        ++first1;
-        ++first2;
-        active_mask |= active_bit;
-      } // end else
-  
-      ++result;
-      active_bit <<= 1;
-    } // end while
-  
-    return active_mask;
-  }
-
-
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
-  inline __device__
-    Size count(Size max_input_size,
-               InputIterator1 first1, InputIterator1 last1,
-               InputIterator2 first2, InputIterator2 last2,
-               Compare comp)
-  {
-    Size result = 0;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-      } // end else if
-      else
-      {
-        ++result;
-        ++first1;
-        ++first2;
-      } // end else
-    } // end while
-  
-    return result;
-  }
-}; // end serial_bounded_set_intersection
-
-
-} // end namespace set_intersection_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_intersection(execution_policy<DerivedPolicy> &exec,
-                                       RandomAccessIterator1 first1,
-                                       RandomAccessIterator1 last1,
-                                       RandomAccessIterator2 first2,
-                                       RandomAccessIterator2 last2,
-                                       RandomAccessIterator3 result,
-                                       Compare comp)
-{
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator3 parallel_path(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-    {
-      return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_intersection_detail::serial_bounded_set_intersection());
-    }
-
-    __host__ __device__
-    static RandomAccessIterator3 sequential_path(execution_policy<DerivedPolicy> &,
-                                                 RandomAccessIterator1 first1,
-                                                 RandomAccessIterator1 last1,
-                                                 RandomAccessIterator2 first2,
-                                                 RandomAccessIterator2 last2,
-                                                 RandomAccessIterator3 result,
-                                                 Compare comp)
-    {
-      return thrust::set_intersection(thrust::seq, first1, last1, first2, last2, result, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first1, last1, first2, last2, result, comp);
-#else
-  return workaround::sequential_path(exec, first1, last1, first2, last2, result, comp);
-#endif
-} // end set_intersection
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/set_operations.h b/thrust/system/cuda/detail/set_operations.h
index 3f6eed5e6..98bb4bb5d 100644
--- a/thrust/system/cuda/detail/set_operations.h
+++ b/thrust/system/cuda/detail/set_operations.h
@@ -1,101 +1,1939 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/pair.h>
+#include <thrust/set_operations.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+namespace __set_operations {
+
+  template <bool UpperBound,
+            class IntT,
+            class Size,
+            class It,
+            class T,
+            class Comp>
+  THRUST_DEVICE_FUNCTION void
+  binary_search_iteration(It   data,
+                          Size &begin,
+                          Size &end,
+                          T    key,
+                          int  shift,
+                          Comp comp)
+  {
+
+    IntT scale = (1 << shift) - 1;
+    Size mid   = (begin + scale * end) >> shift;
+
+    T    key2 = data[mid];
+    bool pred = UpperBound ? !comp(key, key2) : comp(key2, key);
+    if (pred)
+      begin = mid + 1;
+    else
+      end = mid;
+  }
+
+  template <bool UpperBound, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  binary_search(It data, Size count, T key, Comp comp)
+  {
+    Size begin = 0;
+    Size end   = count;
+    while (begin < end)
+      binary_search_iteration<UpperBound, int>(data,
+                                               begin,
+                                               end,
+                                               key,
+                                               1,
+                                               comp);
+    return begin;
+  }
+
+  template <bool UpperBound, class IntT, class Size, class T, class It, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  biased_binary_search(It data, Size count, T key, IntT levels, Comp comp)
+  {
+    Size begin = 0;
+    Size end   = count;
+
+    if (levels >= 4 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 9, comp);
+    if (levels >= 3 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 7, comp);
+    if (levels >= 2 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 5, comp);
+    if (levels >= 1 && begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 4, comp);
+
+    while (begin < end)
+      binary_search_iteration<UpperBound, IntT>(data, begin, end, key, 1, comp);
+    return begin;
+  }
+
+  template <bool UpperBound, class Size, class It1, class It2, class Comp>
+  THRUST_DEVICE_FUNCTION Size
+  merge_path(It1 a, Size aCount, It2 b, Size bCount, Size diag, Comp comp)
+  {
+    typedef typename thrust::iterator_traits<It1>::value_type T;
+
+    Size begin = thrust::max<Size>(0, diag - bCount);
+    Size end   = thrust::min<Size>(diag, aCount);
+
+    while (begin < end)
+    {
+      Size  mid  = (begin + end) >> 1;
+      T    aKey = a[mid];
+      T    bKey = b[diag - 1 - mid];
+      bool pred = UpperBound ? comp(aKey, bKey) : !comp(bKey, aKey);
+      if (pred)
+        begin = mid + 1;
+      else
+        end = mid;
+    }
+    return begin;
+  }
+
+  template <class It1, class It2, class Size, class Size2, class CompareOp>
+  THRUST_DEVICE_FUNCTION pair<Size, Size>
+  balanced_path(It1       keys1,
+                It2       keys2,
+                Size      num_keys1,
+                Size      num_keys2,
+                Size      diag,
+                Size2     levels,
+                CompareOp compare_op)
+  {
+    typedef typename iterator_traits<It1>::value_type T;
+
+    Size index1 = merge_path<false>(keys1,
+                                    num_keys1,
+                                    keys2,
+                                    num_keys2,
+                                    diag,
+                                    compare_op);
+    Size index2 = diag - index1;
+
+    bool star = false;
+    if (index2 < num_keys2)
+    {
+      T x = keys2[index2];
+
+      // Search for the beginning of the duplicate run in both A and B.
+      Size start1 = biased_binary_search<false>(keys1,
+                                                index1,
+                                                x,
+                                                levels,
+                                                compare_op);
+      Size start2 = biased_binary_search<false>(keys2,
+                                                index2,
+                                                x,
+                                                levels,
+                                                compare_op);
+
+      // The distance between x's merge path and its lower_bound is its rank.
+      // We add up the a and b ranks and evenly distribute them to
+      // get a stairstep path.
+      Size run1      = index1 - start1;
+      Size run2      = index2 - start2;
+      Size total_run = run1 + run2;
+
+      // Attempt to advance b and regress a.
+      Size advance2 = max<Size>(total_run >> 1, total_run - run1);
+      Size end2     = min<Size>(num_keys2, start2 + advance2 + 1);
+
+      Size run_end2 = index2 + binary_search<true>(keys2 + index2,
+                                                   end2 - index2,
+                                                   x,
+                                                   compare_op);
+      run2 = run_end2 - start2;
+
+      advance2      = min<Size>(advance2, run2);
+      Size advance1 = total_run - advance2;
+
+      bool round_up      = (advance1 == advance2 + 1) && (advance2 < run2);
+      if (round_up) star = true;
+
+      index1 = start1 + advance1;
+    }
+    return thrust::make_pair(index1, (diag - index1) + star);
+  }    // func balanced_path
+
+  template <int                      _BLOCK_THREADS,
+            int                      _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm  _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier   _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm  _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD - 1
+    };
+
+    static const cub::BlockLoadAlgorithm  LOAD_ALGORITHM  = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier   LOAD_MODIFIER   = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm  SCAN_ALGORITHM  = _SCAN_ALGORITHM;
+  };    // PtxPolicy
+
+  template<class Arch, class T, class U>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, class U>
+  struct Tuning<sm30,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T),    // + sizeof(Value),
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES)>::value>::value,
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm30
+
+  template<class T, class U>
+  struct Tuning<sm52,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T), // + sizeof(U),
+      NOMINAL_4B_ITEMS_PER_THREAD = 15,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES)>::value>::value,
+    };
+
+    typedef PtxPolicy<256,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm52
+
+  template<class T, class U>
+  struct Tuning<sm60,T,U>
+  {
+    enum
+    {
+      MAX_INPUT_BYTES             = mpl::max<size_t, sizeof(T), sizeof(U)>::value,
+      COMBINED_INPUT_BYTES        = sizeof(T), // + sizeof(U),
+      NOMINAL_4B_ITEMS_PER_THREAD = 19,
+      ITEMS_PER_THREAD            = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<
+              int,
+              1,
+              static_cast<int>(((NOMINAL_4B_ITEMS_PER_THREAD * 4) +
+               COMBINED_INPUT_BYTES - 1) /
+                  COMBINED_INPUT_BYTES)>::value>::value,
+    };
+
+    typedef PtxPolicy<512,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  }; // tuning sm60
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class ValuesIt1,
+            class ValuesIt2,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class Size,
+            class CompareOp,
+            class SetOp,
+            class HAS_VALUES>
+  struct SetOpAgent
+  {
+    typedef typename iterator_traits<KeysIt1>::value_type  key1_type;
+    typedef typename iterator_traits<KeysIt2>::value_type  key2_type;
+    typedef typename iterator_traits<ValuesIt1>::value_type value1_type;
+    typedef typename iterator_traits<ValuesIt2>::value_type value2_type;
+
+    typedef key1_type  key_type;
+    typedef value1_type value_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type, value_type>::type
+    {
+      typedef Tuning<Arch, key_type, value_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeysIt1>::type   KeysLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, KeysIt2>::type   KeysLoadIt2;
+      typedef typename core::LoadIterator<PtxPlan, ValuesIt1>::type ValuesLoadIt1;
+      typedef typename core::LoadIterator<PtxPlan, ValuesIt2>::type ValuesLoadIt2;
+
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt1>::type   BlockLoadKeys1;
+      typedef typename core::BlockLoad<PtxPlan, KeysLoadIt2>::type   BlockLoadKeys2;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt1>::type BlockLoadValues1;
+      typedef typename core::BlockLoad<PtxPlan, ValuesLoadIt2>::type BlockLoadValues2;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      // gather required temporary storage in a union
+      //
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage          scan;
+          typename TilePrefixCallback::TempStorage prefix;
+        } scan_storage;
+
+        struct LoadStorage
+        {
+          core::uninitialized_array<int, PtxPlan::BLOCK_THREADS> offset;
+          union
+          {
+            // FIXME These don't appear to be used anywhere?
+            typename BlockLoadKeys1::TempStorage   load_keys1;
+            typename BlockLoadKeys2::TempStorage   load_keys2;
+            typename BlockLoadValues1::TempStorage load_values1;
+            typename BlockLoadValues2::TempStorage load_values2;
+
+            // Allocate extra shmem than truely neccessary
+            // This will permit to avoid range checks in
+            // serial set operations, e.g. serial_set_difference
+            core::uninitialized_array<
+                key_type,
+                PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS>
+                keys_shared;
+
+            core::uninitialized_array<
+                value_type,
+                PtxPlan::ITEMS_PER_TILE + PtxPlan::BLOCK_THREADS>
+                values_shared;
+          }; // anon union
+        } load_storage; // struct LoadStorage
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeysLoadIt1   KeysLoadIt1;
+    typedef typename ptx_plan::KeysLoadIt2   KeysLoadIt2;
+    typedef typename ptx_plan::ValuesLoadIt1 ValuesLoadIt1;
+    typedef typename ptx_plan::ValuesLoadIt2 ValuesLoadIt2;
+
+    typedef typename ptx_plan::BlockLoadKeys1   BlockLoadKeys1;
+    typedef typename ptx_plan::BlockLoadKeys2   BlockLoadKeys2;
+    typedef typename ptx_plan::BlockLoadValues1 BlockLoadValues1;
+    typedef typename ptx_plan::BlockLoadValues2 BlockLoadValues2;
+
+    typedef typename ptx_plan::TilePrefixCallback TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan BlockScan;
+
+    typedef typename ptx_plan::TempStorage TempStorage;
+
+    enum
+    {
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &  storage;
+      ScanTileState &tile_state;
+      KeysLoadIt1    keys1_in;
+      KeysLoadIt2    keys2_in;
+      ValuesLoadIt1  values1_in;
+      ValuesLoadIt2  values2_in;
+      Size           keys1_count;
+      Size           keys2_count;
+      KeysOutputIt   keys_out;
+      ValuesOutputIt values_out;
+      CompareOp      compare_op;
+      SetOp          set_op;
+      pair<Size, Size> *partitions;
+      std::size_t *output_count;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      template <bool IS_FULL_TILE, class T, class It1, class It2>
+      THRUST_DEVICE_FUNCTION void
+      gmem_to_reg(T (&output)[ITEMS_PER_THREAD],
+                  It1 input1,
+                  It2 input2,
+                  int count1,
+                  int count2)
+      {
+        if (IS_FULL_TILE)
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD - 1; ++ITEM)
+          {
+            int idx      = BLOCK_THREADS * ITEM + threadIdx.x;
+            output[ITEM] = (idx < count1)
+                               ? static_cast<T>(input1[idx])
+                               : static_cast<T>(input2[idx - count1]);
+          }
+
+          // last ITEM might be a conditional load even for full tiles
+          // please check first before attempting to load.
+          int ITEM = ITEMS_PER_THREAD - 1;
+          int idx  = BLOCK_THREADS * ITEM + threadIdx.x;
+          if (idx < count1 + count2)
+            output[ITEM] = (idx < count1)
+                               ? static_cast<T>(input1[idx])
+                               : static_cast<T>(input2[idx - count1]);
+        }
+        else
+        {
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+            if (idx < count1 + count2)
+            {
+              output[ITEM] = (idx < count1)
+                                 ? static_cast<T>(input1[idx])
+                                 : static_cast<T>(input2[idx - count1]);
+            }
+          }
+        }
+      }
+
+      template <class T, class It>
+      THRUST_DEVICE_FUNCTION void
+      reg_to_shared(It output,
+                    T (&input)[ITEMS_PER_THREAD])
+      {
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int idx = BLOCK_THREADS * ITEM + threadIdx.x;
+          output[idx] = input[ITEM];
+        }
+      }
+
+      template <class OutputIt, class T, class SharedIt>
+      void THRUST_DEVICE_FUNCTION
+      scatter(OutputIt output,
+              T (&input)[ITEMS_PER_THREAD],
+              SharedIt shared,
+              int      active_mask,
+              Size     thread_output_prefix,
+              Size     tile_output_prefix,
+              int      tile_output_count)
+      {
+        using core::sync_threadblock;
+
+
+
+        int local_scatter_idx = thread_output_prefix - tile_output_prefix;
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          if (active_mask & (1 << ITEM))
+          {
+            shared[local_scatter_idx++] = input[ITEM];
+          }
+        }
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < tile_output_count;
+             item += BLOCK_THREADS)
+        {
+          output[tile_output_prefix + item] = shared[item];
+        }
+      }
+
+      int THRUST_DEVICE_FUNCTION
+      serial_set_op(key_type *keys,
+                    int       keys1_beg,
+                    int       keys2_beg,
+                    int       keys1_count,
+                    int       keys2_count,
+                    key_type (&output)[ITEMS_PER_THREAD],
+                    int (&indices)[ITEMS_PER_THREAD],
+                    CompareOp compare_op,
+                    SetOp     set_op)
+      {
+        int active_mask = set_op(keys,
+                                 keys1_beg,
+                                 keys2_beg,
+                                 keys1_count,
+                                 keys2_count,
+                                 output,
+                                 indices,
+                                 compare_op);
+
+        return active_mask;
+      }
+
+      //---------------------------------------------------------------------
+      // Tile operations
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE>
+      void THRUST_DEVICE_FUNCTION
+      consume_tile(Size tile_idx)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        pair<Size, Size> partition_beg = partitions[tile_idx + 0];
+        pair<Size, Size> partition_end = partitions[tile_idx + 1];
+
+        Size keys1_beg = partition_beg.first;
+        Size keys1_end = partition_end.first;
+        Size keys2_beg = partition_beg.second;
+        Size keys2_end = partition_end.second;
+
+        // number of keys per tile
+        //
+        int num_keys1 = static_cast<int>(keys1_end - keys1_beg);
+        int num_keys2 = static_cast<int>(keys2_end - keys2_beg);
+
+
+       // load keys into shared memory for further processing
+        key_type keys_loc[ITEMS_PER_THREAD];
+
+        gmem_to_reg<!IS_LAST_TILE>(keys_loc,
+                                   keys1_in + keys1_beg,
+                                   keys2_in + keys2_beg,
+                                   num_keys1,
+                                   num_keys2);
+
+        reg_to_shared(&storage.load_storage.keys_shared[0], keys_loc);
+
+        sync_threadblock();
+
+        int diag_loc = min<int>(ITEMS_PER_THREAD * threadIdx.x,
+                                num_keys1 + num_keys2);
+
+        pair<int, int> partition_loc =
+            balanced_path(&storage.load_storage.keys_shared[0],
+                          &storage.load_storage.keys_shared[num_keys1],
+                          num_keys1,
+                          num_keys2,
+                          diag_loc,
+                          4,
+                          compare_op);
+
+        int keys1_beg_loc = partition_loc.first;
+        int keys2_beg_loc = partition_loc.second;
+
+        // compute difference between next and current thread
+        // to obtain number of elements per thread
+        int value = threadIdx.x == 0
+                        ? (num_keys1 << 16) | num_keys2
+                        : (partition_loc.first << 16) | partition_loc.second;
+
+        int dst = threadIdx.x == 0 ? BLOCK_THREADS - 1 : threadIdx.x - 1;
+        storage.load_storage.offset[dst] = value;
+
+        core::sync_threadblock();
+
+        pair<int,int> partition1_loc = thrust::make_pair(
+          storage.load_storage.offset[threadIdx.x] >> 16,
+          storage.load_storage.offset[threadIdx.x] & 0xFFFF);
+
+        int keys1_end_loc = partition1_loc.first;
+        int keys2_end_loc = partition1_loc.second;
+
+        int num_keys1_loc = keys1_end_loc - keys1_beg_loc;
+        int num_keys2_loc = keys2_end_loc - keys2_beg_loc;
+
+        // perform serial set operation
+        //
+        int indices[ITEMS_PER_THREAD];
+
+        int active_mask = serial_set_op(&storage.load_storage.keys_shared[0],
+                                        keys1_beg_loc,
+                                        keys2_beg_loc + num_keys1,
+                                        num_keys1_loc,
+                                        num_keys2_loc,
+                                        keys_loc,
+                                        indices,
+                                        compare_op,
+                                        set_op);
+        sync_threadblock();
+#if 0
+        if (ITEMS_PER_THREAD*threadIdx.x >= num_keys1 + num_keys2)
+          active_mask = 0;
+#endif
+
+        // look-back scan over thread_output_count
+        // to compute global thread_output_base and tile_otput_count;
+        Size tile_output_count    = 0;
+        Size thread_output_prefix = 0;
+        Size tile_output_prefix   = 0;
+        Size thread_output_count = static_cast<Size>(__popc(active_mask));
+
+        if (tile_idx == 0)    // first tile
+        {
+          BlockScan(storage.scan_storage.scan)
+              .ExclusiveSum(thread_output_count,
+                            thread_output_prefix,
+                            tile_output_count);
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+            {
+              tile_state.SetInclusive(0, tile_output_count);
+            }
+          }
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       storage.scan_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+
+          BlockScan(storage.scan_storage.scan)
+              .ExclusiveSum(thread_output_count,
+                            thread_output_prefix,
+                            prefix_cb);
+          tile_output_count  = prefix_cb.GetBlockAggregate();
+          tile_output_prefix = prefix_cb.GetExclusivePrefix();
+        }
+
+        sync_threadblock();
+
+        // scatter results
+        //
+        scatter(keys_out,
+                keys_loc,
+                &storage.load_storage.keys_shared[0],
+                active_mask,
+                thread_output_prefix,
+                tile_output_prefix,
+                tile_output_count);
+
+        if (HAS_VALUES::value)
+        {
+          value_type values_loc[ITEMS_PER_THREAD];
+          gmem_to_reg<!IS_LAST_TILE>(values_loc,
+                                     values1_in + keys1_beg,
+                                     values2_in + keys2_beg,
+                                     num_keys1,
+                                     num_keys2);
+
+          sync_threadblock();
+
+          reg_to_shared(&storage.load_storage.values_shared[0], values_loc);
+
+          sync_threadblock();
+
+          // gather items from shared mem
+          //
+#pragma unroll
+          for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+          {
+            if (active_mask & (1 << ITEM))
+            {
+              values_loc[ITEM] = storage.load_storage.values_shared[indices[ITEM]];
+            }
+          }
+
+          sync_threadblock();
+
+          scatter(values_out,
+                  values_loc,
+                  &storage.load_storage.values_shared[0],
+                  active_mask,
+                  thread_output_prefix,
+                  tile_output_prefix,
+                  tile_output_count);
+        }
+
+        if (IS_LAST_TILE && threadIdx.x == 0)
+        {
+          *output_count = tile_output_prefix + tile_output_count;
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &  storage_,
+           ScanTileState &tile_state_,
+           KeysIt1        keys1_,
+           KeysIt2        keys2_,
+           ValuesIt1      values1_,
+           ValuesIt2      values2_,
+           Size           keys1_count_,
+           Size           keys2_count_,
+           KeysOutputIt   keys_out_,
+           ValuesOutputIt values_out_,
+           CompareOp      compare_op_,
+           SetOp          set_op_,
+           pair<Size, Size> *partitions_,
+           std::size_t * output_count_)
+          : storage(storage_),
+            tile_state(tile_state_),
+            keys1_in(core::make_load_iterator(ptx_plan(), keys1_)),
+            keys2_in(core::make_load_iterator(ptx_plan(), keys2_)),
+            values1_in(core::make_load_iterator(ptx_plan(), values1_)),
+            values2_in(core::make_load_iterator(ptx_plan(), values2_)),
+            keys1_count(keys1_count_),
+            keys2_count(keys2_count_),
+            keys_out(keys_out_),
+            values_out(values_out_),
+            compare_op(compare_op_),
+            set_op(set_op_),
+            partitions(partitions_),
+            output_count(output_count_)
+      {
+        int  tile_idx      = blockIdx.x;
+        int  num_tiles     = gridDim.x;
+
+        if (tile_idx < num_tiles-1)
+        {
+          consume_tile<false>(tile_idx);
+        }
+        else
+        {
+          consume_tile<true>(tile_idx);
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1        keys1,
+                       KeysIt2        keys2,
+                       ValuesIt1      values1,
+                       ValuesIt2      values2,
+                       Size           keys1_count,
+                       Size           keys2_count,
+                       KeysOutputIt   keys_output,
+                       ValuesOutputIt values_output,
+                       CompareOp      compare_op,
+                       SetOp          set_op,
+                       pair<Size, Size> *partitions,
+                       std::size_t *  output_count,
+                       ScanTileState tile_state,
+                       char *        shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           keys1,
+           keys2,
+           values1,
+           values2,
+           keys1_count,
+           keys2_count,
+           keys_output,
+           values_output,
+           compare_op,
+           set_op,
+           partitions,
+           output_count);
+    }
+  };    // struct SetOpAgent
+
+  template <class KeysIt1,
+            class KeysIt2,
+            class Size,
+            class CompareOp>
+  struct PartitionAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<256> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeysIt1 keys1,
+                       KeysIt2 keys2,
+                       Size    keys1_count,
+                       Size    keys2_count,
+                       Size    num_partitions,
+                       pair<Size, Size> *partitions,
+                       CompareOp compare_op,
+                       int       items_per_tile,
+                       char * /*shmem*/)
+    {
+      Size partition_idx = blockDim.x * blockIdx.x + threadIdx.x;
+      if (partition_idx < num_partitions)
+      {
+        Size partition_at = min<Size>(partition_idx * items_per_tile,
+                                      keys1_count + keys2_count);
+        pair<Size, Size> diag = balanced_path(keys1,
+                                              keys2,
+                                              keys1_count,
+                                              keys2_count,
+                                              partition_at,
+                                              4ll,
+                                              compare_op);
+        partitions[partition_idx] = diag;
+      }
+    }
+  };    // struct PartitionAgent
+
+  template <class ScanTileState,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_difference(execution_policy<DerivedPolicy> &exec,
-                                     RandomAccessIterator1 first1,
-                                     RandomAccessIterator1 last1,
-                                     RandomAccessIterator2 first2,
-                                     RandomAccessIterator2 last2,
-                                     RandomAccessIterator3 result,
-                                     Compare comp);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_intersection(execution_policy<DerivedPolicy> &exec,
-                                       RandomAccessIterator1 first1,
-                                       RandomAccessIterator1 last1,
-                                       RandomAccessIterator2 first2,
-                                       RandomAccessIterator2 last2,
-                                       RandomAccessIterator3 result,
-                                       Compare comp);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_symmetric_difference(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp);
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_union(execution_policy<DerivedPolicy> &exec,
-                                RandomAccessIterator1 first1,
-                                RandomAccessIterator1 last1,
-                                RandomAccessIterator2 first2,
-                                RandomAccessIterator2 last2,
-                                RandomAccessIterator3 result,
-                                Compare comp);
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/set_difference.inl>
-#include <thrust/system/cuda/detail/set_intersection.inl>
-#include <thrust/system/cuda/detail/set_symmetric_difference.inl>
-#include <thrust/system/cuda/detail/set_union.inl>
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+    }
+  }; // struct InitAgent
+
+  //---------------------------------------------------------------------
+  // Serial set operations
+  //---------------------------------------------------------------------
+
+  // serial_set_intersection
+  // -----------------------
+  // emit A if A and B are in range and equal.
+  struct serial_set_intersection
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pA = compare_op(aKey, bKey);
+        bool pB = compare_op(bKey, aKey);
+
+        // The outputs must come from A by definition of set interection.
+        output[i]  = aKey;
+        indices[i] = aBegin;
+
+        if ((aBegin < aEnd) && (bBegin < bEnd) && pA == pB)
+          active_mask |= 1 << i;
+
+        if (!pB) {aKey = keys[++aBegin]; }
+        if (!pA) {bKey = keys[++bBegin]; }
+      }
+      return active_mask;
+    }
+  };    // struct serial_set_intersection
+
+  // serial_set_symmetric_difference
+  // ---------------------
+  // emit A if A < B and emit B if B < A.
+  struct serial_set_symmetric_difference
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // The outputs must come from A by definition of set difference.
+        output[i]  = pA ? aKey : bKey;
+        indices[i] = pA ? aBegin : bBegin;
+
+        if (aBegin + bBegin < end && pA != pB)
+          active_mask |= 1 << i;
+
+        if (!pB) {aKey = keys[++aBegin]; }
+        if (!pA) {bKey = keys[++bBegin]; }
+
+      }
+      return active_mask;
+    }
+  };    // struct set_symmetric_difference
+
+  // serial_set_difference
+  // ---------------------
+  // emit A if A < B
+  struct serial_set_difference
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // The outputs must come from A by definition of set difference.
+        output[i]  = aKey;
+        indices[i] = aBegin;
+
+        if (aBegin + bBegin < end && pA)
+          active_mask |= 1 << i;
+
+        if (!pB) { aKey = keys[++aBegin]; }
+        if (!pA) { bKey = keys[++bBegin]; }
+      }
+      return active_mask;
+    }
+  };    // struct set_difference
+
+  // serial_set_union
+  // ----------------
+  // emit A if A <= B else emit B
+  struct serial_set_union
+  {
+    // max_input_size <= 32
+    template <class T, class CompareOp, int ITEMS_PER_THREAD>
+    int THRUST_DEVICE_FUNCTION
+    operator()(T * keys,
+               int keys1_beg,
+               int keys2_beg,
+               int keys1_count,
+               int keys2_count,
+               T (&output)[ITEMS_PER_THREAD],
+               int (&indices)[ITEMS_PER_THREAD],
+               CompareOp compare_op)
+    {
+      int active_mask = 0;
+
+      int aBegin = keys1_beg;
+      int bBegin = keys2_beg;
+      int aEnd   = keys1_beg + keys1_count;
+      int bEnd   = keys2_beg + keys2_count;
+      int end    = aEnd + bEnd;
+
+      T aKey = keys[aBegin];
+      T bKey = keys[bBegin];
+
+#pragma unroll
+      for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+      {
+        bool pB = aBegin >= aEnd;
+        bool pA = !pB && bBegin >= bEnd;
+
+        if (!pA && !pB)
+        {
+          pA = compare_op(aKey, bKey);
+          pB = !pA && compare_op(bKey, aKey);
+        }
+
+        // Output A in case of a tie, so check if b < a.
+        output[i]  = pB ? bKey : aKey;
+        indices[i] = pB ? bBegin : aBegin;
+
+        if (aBegin + bBegin < end)
+          active_mask |= 1 << i;
+
+        if (!pB) { aKey = keys[++aBegin]; }
+        if (!pA) { bKey = keys[++bBegin]; }
+
+      }
+      return active_mask;
+    }
+  };    // struct set_union
+
+  template <class HAS_VALUES,
+            class KeysIt1,
+            class KeysIt2,
+            class ValuesIt1,
+            class ValuesIt2,
+            class Size,
+            class KeysOutputIt,
+            class ValuesOutputIt,
+            class CompareOp,
+            class SetOp>
+  cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *         d_temp_storage,
+            size_t &       temp_storage_size,
+            KeysIt1        keys1,
+            KeysIt2        keys2,
+            ValuesIt1      values1,
+            ValuesIt2      values2,
+            Size           num_keys1,
+            Size           num_keys2,
+            KeysOutputIt   keys_output,
+            ValuesOutputIt values_output,
+            std::size_t *  output_count,
+            CompareOp      compare_op,
+            SetOp          set_op,
+            cudaStream_t   stream)
+  {
+    Size keys_total = num_keys1 + num_keys2;
+    if (keys_total == 0)
+      return cudaErrorNotSupported;
+
+    cudaError_t status = cudaSuccess;
+
+    using core::AgentPlan;
+    using core::AgentLauncher;
+
+    typedef AgentLauncher<
+        SetOpAgent<KeysIt1,
+                   KeysIt2,
+                   ValuesIt1,
+                   ValuesIt2,
+                   KeysOutputIt,
+                   ValuesOutputIt,
+                   Size,
+                   CompareOp,
+                   SetOp,
+                   HAS_VALUES> >
+        set_op_agent;
+
+    typedef AgentLauncher<PartitionAgent<KeysIt1, KeysIt2, Size, CompareOp> >
+        partition_agent;
+
+    typedef typename set_op_agent::ScanTileState ScanTileState;
+    typedef AgentLauncher<InitAgent<ScanTileState, Size> > init_agent;
+
+
+    AgentPlan set_op_plan    = set_op_agent::get_plan(stream);
+    AgentPlan init_plan      = init_agent::get_plan();
+    AgentPlan partition_plan = partition_agent::get_plan();
+
+    int  tile_size = set_op_plan.items_per_tile;
+    Size num_tiles = (keys_total + tile_size - 1) / tile_size;
+
+    size_t tile_agent_storage;
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles),
+                                           tile_agent_storage);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    size_t vshmem_storage = core::vshmem_size(set_op_plan.shared_memory_size,
+                                              num_tiles);
+    size_t partition_agent_storage = (num_tiles + 1) * sizeof(Size) * 2;
+
+    void *allocations[3] = {NULL, NULL, NULL};
+    size_t allocation_sizes[3] = {tile_agent_storage,
+                                  partition_agent_storage,
+                                  vshmem_storage};
+
+    status = core::alias_storage(d_temp_storage,
+                                 temp_storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_state;
+    status = tile_state.Init(static_cast<int>(num_tiles),
+                             allocations[0],
+                             allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    pair<Size, Size> *partitions = (pair<Size, Size> *)allocations[1];
+    char *vshmem_ptr = vshmem_storage > 0 ? (char *)allocations[2] : NULL;
+
+    init_agent ia(init_plan, num_tiles, stream, "set_op::init_agent");
+    ia.launch(tile_state, num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    partition_agent pa(partition_plan, num_tiles+1, stream, "set_op::partition agent");
+    pa.launch(keys1,
+              keys2,
+              num_keys1,
+              num_keys2,
+              num_tiles+1,
+              partitions,
+              compare_op,
+              tile_size);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    set_op_agent sa(set_op_plan, keys_total, stream, vshmem_ptr, "set_op::set_op_agent");
+    sa.launch(keys1,
+              keys2,
+              values1,
+              values2,
+              num_keys1,
+              num_keys2,
+              keys_output,
+              values_output,
+              compare_op,
+              set_op,
+              partitions,
+              output_count,
+              tile_state);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    return status;
+ }
+
+ template <typename HAS_VALUES,
+           typename Derived,
+           typename KeysIt1,
+           typename KeysIt2,
+           typename ValuesIt1,
+           typename ValuesIt2,
+           typename KeysOutputIt,
+           typename ValuesOutputIt,
+           typename CompareOp,
+           typename SetOp>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeysOutputIt, ValuesOutputIt>
+  set_operations(execution_policy<Derived>& policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ValuesIt1                  values1_first,
+                 ValuesIt2                  values2_first,
+                 KeysOutputIt               keys_output,
+                 ValuesOutputIt             values_output,
+                 CompareOp                  compare_op,
+                 SetOp                      set_op)
+  {
+    typedef typename iterator_traits<KeysIt1>::difference_type size_type;
+
+    size_type num_keys1 = static_cast<size_type>(thrust::distance(keys1_first, keys1_last));
+    size_type num_keys2 = static_cast<size_type>(thrust::distance(keys2_first, keys2_last));
+
+    if (num_keys1 + num_keys2 == 0)
+      return thrust::make_pair(keys_output, values_output);
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (NULL,
+                                   temp_storage_bytes,
+                                   keys1_first,
+                                   keys2_first,
+                                   values1_first,
+                                   values2_first,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
+                                   keys_output,
+                                   values_output,
+                                   reinterpret_cast<std::size_t*>(NULL),
+                                   compare_op,
+                                   set_op,
+                                   stream));
+    cuda_cub::throw_on_error(status, "set_operations failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(std::size_t), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "set_operations failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "set_operations failed on 2nd alias_storage");
+
+    std::size_t* d_output_count
+      = thrust::detail::aligned_reinterpret_cast<std::size_t*>(allocations[0]);
+
+    THRUST_DOUBLE_INDEX_TYPE_DISPATCH(status, doit_step<HAS_VALUES>,
+        num_keys1, num_keys2, (allocations[1],
+                                   temp_storage_bytes,
+                                   keys1_first,
+                                   keys2_first,
+                                   values1_first,
+                                   values2_first,
+                                   num_keys1_fixed,
+                                   num_keys2_fixed,
+                                   keys_output,
+                                   values_output,
+                                   d_output_count,
+                                   compare_op,
+                                   set_op,
+                                   stream));
+    cuda_cub::throw_on_error(status, "set_operations failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "set_operations failed to synchronize");
+
+    std::size_t output_count = cuda_cub::get_value(policy, d_output_count);
+
+    return thrust::make_pair(keys_output + output_count, values_output + output_count);
+  }
+}    // namespace __set_operations
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_difference(execution_policy<Derived> &policy,
+               ItemsIt1                   items1_first,
+               ItemsIt1                   items1_last,
+               ItemsIt2                   items2_first,
+               ItemsIt2                   items2_last,
+               OutputIt                   result,
+               CompareOp                  compare)
+{
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_difference());
+     result = tmp.first;),
+    (result = thrust::set_difference(cvt_to_seq(derived_cast(policy)),
+                                     items1_first,
+                                     items1_last,
+                                     items2_first,
+                                     items2_last,
+                                     result,
+                                     compare);));
+  return result;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_difference(execution_policy<Derived> &policy,
+               ItemsIt1                   items1_first,
+               ItemsIt1                   items1_last,
+               ItemsIt2                   items2_first,
+               ItemsIt2                   items2_last,
+               OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_difference(policy,
+                                  items1_first,
+                                  items1_last,
+                                  items2_first,
+                                  items2_last,
+                                  result,
+                                  less<value_type>());
+}
+
+/*****************************/
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_intersection(execution_policy<Derived> &policy,
+                 ItemsIt1                   items1_first,
+                 ItemsIt1                   items1_last,
+                 ItemsIt2                   items2_first,
+                 ItemsIt2                   items2_last,
+                 OutputIt                   result,
+                 CompareOp                  compare)
+{
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = NULL;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_intersection());
+     result = tmp.first;),
+    (result = thrust::set_intersection(cvt_to_seq(derived_cast(policy)),
+                                       items1_first,
+                                       items1_last,
+                                       items2_first,
+                                       items2_last,
+                                       result,
+                                       compare);));
+  return result;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_intersection(execution_policy<Derived> &policy,
+                 ItemsIt1                   items1_first,
+                 ItemsIt1                   items1_last,
+                 ItemsIt2                   items2_first,
+                 ItemsIt2                   items2_last,
+                 OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_intersection(policy,
+                                    items1_first,
+                                    items1_last,
+                                    items2_first,
+                                    items2_last,
+                                    result,
+                                    less<value_type>());
+}
+
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_symmetric_difference(execution_policy<Derived> &policy,
+                         ItemsIt1                   items1_first,
+                         ItemsIt1                   items1_last,
+                         ItemsIt2                   items2_first,
+                         ItemsIt2                   items2_last,
+                         OutputIt                   result,
+                         CompareOp                  compare)
+{
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_symmetric_difference());
+     result = tmp.first;),
+    (result = thrust::set_symmetric_difference(cvt_to_seq(derived_cast(policy)),
+                                               items1_first,
+                                               items1_last,
+                                               items2_first,
+                                               items2_last,
+                                               result,
+                                               compare);));
+  return result;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_symmetric_difference(execution_policy<Derived> &policy,
+                         ItemsIt1                   items1_first,
+                         ItemsIt1                   items1_last,
+                         ItemsIt2                   items2_first,
+                         ItemsIt2                   items2_last,
+                         OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_symmetric_difference(policy,
+                                            items1_first,
+                                            items1_last,
+                                            items2_first,
+                                            items2_last,
+                                            result,
+                                            less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt,
+          class CompareOp>
+OutputIt __host__ __device__
+set_union(execution_policy<Derived> &policy,
+          ItemsIt1                   items1_first,
+          ItemsIt1                   items1_last,
+          ItemsIt2                   items2_first,
+          ItemsIt2                   items2_last,
+          OutputIt                   result,
+          CompareOp                  compare)
+{
+  THRUST_CDP_DISPATCH(
+    (using items1_t  = thrust::iterator_value_t<ItemsIt1>;
+     items1_t *null_ = nullptr;
+     auto tmp = __set_operations::set_operations<thrust::detail::false_type>(
+       policy,
+       items1_first,
+       items1_last,
+       items2_first,
+       items2_last,
+       null_,
+       null_,
+       result,
+       null_,
+       compare,
+       __set_operations::serial_set_union());
+     result = tmp.first;),
+    (result = thrust::set_union(cvt_to_seq(derived_cast(policy)),
+                                items1_first,
+                                items1_last,
+                                items2_first,
+                                items2_last,
+                                result,
+                                compare);));
+  return result;
+}
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2,
+          class OutputIt>
+OutputIt __host__ __device__
+set_union(execution_policy<Derived> &policy,
+          ItemsIt1                   items1_first,
+          ItemsIt1                   items1_last,
+          ItemsIt2                   items2_first,
+          ItemsIt2                   items2_last,
+          OutputIt                   result)
+{
+  typedef typename thrust::iterator_value<ItemsIt1>::type value_type;
+  return cuda_cub::set_union(policy,
+                             items1_first,
+                             items1_last,
+                             items2_first,
+                             items2_last,
+                             result,
+                             less<value_type>());
+}
+
+
+/*****************************/
+/*****************************/
+/*****     *_by_key      *****/
+/*****************************/
+/*****************************/
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_difference_by_key(execution_policy<Derived> &policy,
+                      KeysIt1                    keys1_first,
+                      KeysIt1                    keys1_last,
+                      KeysIt2                    keys2_first,
+                      KeysIt2                    keys2_last,
+                      ItemsIt1                   items1_first,
+                      ItemsIt2                   items2_first,
+                      KeysOutputIt               keys_result,
+                      ItemsOutputIt              items_result,
+                      CompareOp                  compare_op)
+{
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_difference());),
+    (ret = thrust::set_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                         keys1_first,
+                                         keys1_last,
+                                         keys2_first,
+                                         keys2_last,
+                                         items1_first,
+                                         items2_first,
+                                         keys_result,
+                                         items_result,
+                                         compare_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_difference_by_key(execution_policy<Derived> &policy,
+                      KeysIt1                    keys1_first,
+                      KeysIt1                    keys1_last,
+                      KeysIt2                    keys2_first,
+                      KeysIt2                    keys2_last,
+                      ItemsIt1                   items1_first,
+                      ItemsIt2                   items2_first,
+                      KeysOutputIt               keys_result,
+                      ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_difference_by_key(policy,
+                                         keys1_first,
+                                         keys1_last,
+                                         keys2_first,
+                                         keys2_last,
+                                         items1_first,
+                                         items2_first,
+                                         keys_result,
+                                         items_result,
+                                         less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_intersection_by_key(execution_policy<Derived> &policy,
+                        KeysIt1                    keys1_first,
+                        KeysIt1                    keys1_last,
+                        KeysIt2                    keys2_first,
+                        KeysIt2                    keys2_last,
+                        ItemsIt1                   items1_first,
+                        KeysOutputIt               keys_result,
+                        ItemsOutputIt              items_result,
+                        CompareOp                  compare_op)
+{
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items1_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_intersection());),
+    (ret = thrust::set_intersection_by_key(cvt_to_seq(derived_cast(policy)),
+                                           keys1_first,
+                                           keys1_last,
+                                           keys2_first,
+                                           keys2_last,
+                                           items1_first,
+                                           keys_result,
+                                           items_result,
+                                           compare_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_intersection_by_key(execution_policy<Derived> &policy,
+                        KeysIt1                    keys1_first,
+                        KeysIt1                    keys1_last,
+                        KeysIt2                    keys2_first,
+                        KeysIt2                    keys2_last,
+                        ItemsIt1                   items1_first,
+                        KeysOutputIt               keys_result,
+                        ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_intersection_by_key(policy,
+                                           keys1_first,
+                                           keys1_last,
+                                           keys2_first,
+                                           keys2_last,
+                                           items1_first,
+                                           keys_result,
+                                           items_result,
+                                           less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_symmetric_difference_by_key(execution_policy<Derived> &policy,
+                                KeysIt1                    keys1_first,
+                                KeysIt1                    keys1_last,
+                                KeysIt2                    keys2_first,
+                                KeysIt2                    keys2_last,
+                                ItemsIt1                   items1_first,
+                                ItemsIt2                   items2_first,
+                                KeysOutputIt               keys_result,
+                                ItemsOutputIt              items_result,
+                                CompareOp                  compare_op)
+{
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_symmetric_difference());),
+    (ret =
+       thrust::set_symmetric_difference_by_key(cvt_to_seq(derived_cast(policy)),
+                                               keys1_first,
+                                               keys1_last,
+                                               keys2_first,
+                                               keys2_last,
+                                               items1_first,
+                                               items2_first,
+                                               keys_result,
+                                               items_result,
+                                               compare_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_symmetric_difference_by_key(execution_policy<Derived> &policy,
+                                KeysIt1                    keys1_first,
+                                KeysIt1                    keys1_last,
+                                KeysIt2                    keys2_first,
+                                KeysIt2                    keys2_last,
+                                ItemsIt1                   items1_first,
+                                ItemsIt2                   items2_first,
+                                KeysOutputIt               keys_result,
+                                ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_symmetric_difference_by_key(policy,
+                                                   keys1_first,
+                                                   keys1_last,
+                                                   keys2_first,
+                                                   keys2_last,
+                                                   items1_first,
+                                                   items2_first,
+                                                   keys_result,
+                                                   items_result,
+                                                   less<value_type>());
+}
+
+/*****************************/
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt,
+          class CompareOp>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_union_by_key(execution_policy<Derived> &policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ItemsIt1                   items1_first,
+                 ItemsIt2                   items2_first,
+                 KeysOutputIt               keys_result,
+                 ItemsOutputIt              items_result,
+                 CompareOp                  compare_op)
+{
+  auto ret = thrust::make_pair(keys_result, items_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __set_operations::set_operations<thrust::detail::true_type>(
+       policy,
+       keys1_first,
+       keys1_last,
+       keys2_first,
+       keys2_last,
+       items1_first,
+       items2_first,
+       keys_result,
+       items_result,
+       compare_op,
+       __set_operations::serial_set_union());),
+    (ret = thrust::set_union_by_key(cvt_to_seq(derived_cast(policy)),
+                                    keys1_first,
+                                    keys1_last,
+                                    keys2_first,
+                                    keys2_last,
+                                    items1_first,
+                                    items2_first,
+                                    keys_result,
+                                    items_result,
+                                    compare_op);));
+  return ret;
+}
+
+template <class Derived,
+          class KeysIt1,
+          class KeysIt2,
+          class ItemsIt1,
+          class ItemsIt2,
+          class KeysOutputIt,
+          class ItemsOutputIt>
+pair<KeysOutputIt, ItemsOutputIt> __host__ __device__
+set_union_by_key(execution_policy<Derived> &policy,
+                 KeysIt1                    keys1_first,
+                 KeysIt1                    keys1_last,
+                 KeysIt2                    keys2_first,
+                 KeysIt2                    keys2_last,
+                 ItemsIt1                   items1_first,
+                 ItemsIt2                   items2_first,
+                 KeysOutputIt               keys_result,
+                 ItemsOutputIt              items_result)
+{
+  typedef typename thrust::iterator_value<KeysIt1>::type value_type;
+  return cuda_cub::set_union_by_key(policy,
+                                    keys1_first,
+                                    keys1_last,
+                                    keys2_first,
+                                    keys2_last,
+                                    items1_first,
+                                    items2_first,
+                                    keys_result,
+                                    items_result,
+                                    less<value_type>());
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/set_symmetric_difference.inl b/thrust/system/cuda/detail/set_symmetric_difference.inl
deleted file mode 100644
index acd52cddf..000000000
--- a/thrust/system/cuda/detail/set_symmetric_difference.inl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/set_operations.h>
-#include <thrust/detail/cstdint.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/seq.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace set_symmetric_difference_detail
-{
-
-
-struct serial_bounded_set_symmetric_difference
-{
-  // max_input_size <= 32
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
-  inline __device__
-    thrust::detail::uint32_t operator()(Size max_input_size,
-                                        InputIterator1 first1, InputIterator1 last1,
-                                        InputIterator2 first2, InputIterator2 last2,
-                                        OutputIterator result,
-                                        Compare comp)
-  {
-    thrust::detail::uint32_t active_mask = 0;
-    thrust::detail::uint32_t active_bit = 1;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        *result = *first1;
-        active_mask |= active_bit;
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        *result = *first2;
-        active_mask |= active_bit;
-        ++first2;
-      } // end else if
-      else
-      {
-        ++first1;
-        ++first2;
-      } // end else
-  
-      ++result;
-      active_bit <<= 1;
-    } // end while
-
-    while(first1 != last1)
-    {
-      *result = *first1;
-      ++first1;
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    }
-
-    while(first2 != last2)
-    {
-      *result = *first2;
-      ++first2;
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    }
-  
-    return active_mask;
-  }
-
-
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
-  inline __device__
-    Size count(Size max_input_size,
-               InputIterator1 first1, InputIterator1 last1,
-               InputIterator2 first2, InputIterator2 last2,
-               Compare comp)
-  {
-    Size result = 0;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        ++first1;
-        ++result;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-        ++result;
-      } // end else if
-      else
-      {
-        ++first1;
-        ++first2;
-      } // end else
-    } // end while
-  
-    return result + thrust::max(last1 - first1,last2 - first2);
-  }
-}; // end serial_bounded_set_symmetric_difference
-
-
-} // end namespace set_symmetric_difference_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_symmetric_difference(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-{
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator3 parallel_path(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-    {
-      return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_symmetric_difference_detail::serial_bounded_set_symmetric_difference());
-    }
-
-    __host__ __device__
-    static RandomAccessIterator3 sequential_path(execution_policy<DerivedPolicy> &,
-                                                 RandomAccessIterator1 first1,
-                                                 RandomAccessIterator1 last1,
-                                                 RandomAccessIterator2 first2,
-                                                 RandomAccessIterator2 last2,
-                                                 RandomAccessIterator3 result,
-                                                 Compare comp)
-    {
-      return thrust::set_symmetric_difference(thrust::seq, first1, last1, first2, last2, result, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first1, last1, first2, last2, result, comp);
-#else
-  return workaround::sequential_path(exec, first1, last1, first2, last2, result, comp);
-#endif
-} // end set_symmetric_difference
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/set_union.inl b/thrust/system/cuda/detail/set_union.inl
deleted file mode 100644
index 1de2238dd..000000000
--- a/thrust/system/cuda/detail/set_union.inl
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/set_operations.h>
-#include <thrust/detail/cstdint.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/detail/set_operation.h>
-#include <thrust/system/cuda/detail/bulk.h>
-#include <thrust/detail/seq.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace set_union_detail
-{
-
-
-struct serial_bounded_set_union
-{
-  // max_input_size <= 32
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename OutputIterator, typename Compare>
-  inline __device__
-    thrust::detail::uint32_t operator()(Size max_input_size,
-                                        InputIterator1 first1, InputIterator1 last1,
-                                        InputIterator2 first2, InputIterator2 last2,
-                                        OutputIterator result,
-                                        Compare comp)
-  {
-    thrust::detail::uint32_t active_mask = 0;
-    thrust::detail::uint32_t active_bit = 1;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        *result = *first1;
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        *result = *first2;
-        ++first2;
-      } // end else if
-      else
-      {
-        *result = *first1;
-        ++first1;
-        ++first2;
-      } // end else
-  
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    } // end while
-
-    while(first1 != last1)
-    {
-      *result = *first1;
-      ++first1;
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    }
-
-    while(first2 != last2)
-    {
-      *result = *first2;
-      ++first2;
-      ++result;
-      active_mask |= active_bit;
-      active_bit <<= 1;
-    }
-  
-    return active_mask;
-  }
-
-
-  template<typename Size, typename InputIterator1, typename InputIterator2, typename Compare>
-  inline __device__
-    Size count(Size max_input_size,
-               InputIterator1 first1, InputIterator1 last1,
-               InputIterator2 first2, InputIterator2 last2,
-               Compare comp)
-  {
-    Size result = 0;
-  
-    while(first1 != last1 && first2 != last2)
-    {
-      if(comp(*first1,*first2))
-      {
-        ++first1;
-      } // end if
-      else if(comp(*first2,*first1))
-      {
-        ++first2;
-      } // end else if
-      else
-      {
-        ++first1;
-        ++first2;
-      } // end else
-
-      ++result;
-    } // end while
-  
-    return result + thrust::max(last1 - first1,last2 - first2);
-  }
-}; // end serial_bounded_set_union
-
-
-} // end namespace set_union_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2, 
-	 typename RandomAccessIterator3,
-         typename Compare>
-__host__ __device__
-RandomAccessIterator3 set_union(execution_policy<DerivedPolicy> &exec,
-                                RandomAccessIterator1 first1,
-                                RandomAccessIterator1 last1,
-                                RandomAccessIterator2 first2,
-                                RandomAccessIterator2 last2,
-                                RandomAccessIterator3 result,
-                                Compare comp)
-{
-  struct workaround
-  {
-    __host__ __device__
-    static RandomAccessIterator3 parallel_path(execution_policy<DerivedPolicy> &exec,
-                                               RandomAccessIterator1 first1,
-                                               RandomAccessIterator1 last1,
-                                               RandomAccessIterator2 first2,
-                                               RandomAccessIterator2 last2,
-                                               RandomAccessIterator3 result,
-                                               Compare comp)
-    {
-      return thrust::system::cuda::detail::detail::set_operation(exec, first1, last1, first2, last2, result, comp, set_union_detail::serial_bounded_set_union());
-    }
-
-    __host__ __device__
-    static RandomAccessIterator3 sequential_path(execution_policy<DerivedPolicy> &,
-                                                 RandomAccessIterator1 first1,
-                                                 RandomAccessIterator1 last1,
-                                                 RandomAccessIterator2 first2,
-                                                 RandomAccessIterator2 last2,
-                                                 RandomAccessIterator3 result,
-                                                 Compare comp)
-    {
-      return thrust::set_union(thrust::seq, first1, last1, first2, last2, result, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  return workaround::parallel_path(exec, first1, last1, first2, last2, result, comp);
-#else
-  return workaround::sequential_path(exec, first1, last1, first2, last2, result, comp);
-#endif
-} // end set_union
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/sort.h b/thrust/system/cuda/detail/sort.h
index 1e66a82d6..db4c211b3 100644
--- a/thrust/system/cuda/detail/sort.h
+++ b/thrust/system/cuda/detail/sort.h
@@ -1,60 +1,628 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
+
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/distance.h>
+#include <thrust/extrema.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/core/util.h>
 #include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
 
-namespace thrust
-{
-namespace system
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/integer_math.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/trivial_sequence.h>
+
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_merge_sort.cuh>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __merge_sort {
+
+  template <class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void*        d_temp_storage,
+            size_t&      temp_storage_bytes,
+            KeysIt       keys,
+            ItemsIt      ,
+            Size         keys_count,
+            CompareOp    compare_op,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, false> /* sort_keys */)
+  {
+    using ItemsInputIt = cub::NullType *;
+    ItemsInputIt items = nullptr;
+
+    using DispatchMergeSortT = cub::DispatchMergeSort<KeysIt,
+                                                      ItemsInputIt,
+                                                      KeysIt,
+                                                      ItemsInputIt,
+                                                      Size,
+                                                      CompareOp>;
+
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        keys,
+                                        items,
+                                        keys,
+                                        items,
+                                        keys_count,
+                                        compare_op,
+                                        stream);
+  }
+
+  template <class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            KeysIt keys,
+            ItemsIt items,
+            Size keys_count,
+            CompareOp compare_op,
+            cudaStream_t stream,
+            thrust::detail::integral_constant<bool, true> /* sort_items */)
+  {
+    using DispatchMergeSortT =
+      cub::DispatchMergeSort<KeysIt, ItemsIt, KeysIt, ItemsIt, Size, CompareOp>;
+
+    return DispatchMergeSortT::Dispatch(d_temp_storage,
+                                        temp_storage_bytes,
+                                        keys,
+                                        items,
+                                        keys,
+                                        items,
+                                        keys_count,
+                                        compare_op,
+                                        stream);
+  }
+
+  template <class SORT_ITEMS,
+            class /* STABLE */,
+            class KeysIt,
+            class ItemsIt,
+            class Size,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION cudaError_t
+  doit_step(void *d_temp_storage,
+            size_t &temp_storage_bytes,
+            KeysIt keys,
+            ItemsIt items,
+            Size keys_count,
+            CompareOp compare_op,
+            cudaStream_t stream)
+  {
+    if (keys_count == 0)
+    {
+      return cudaSuccess;
+    }
+
+    thrust::detail::integral_constant<bool, SORT_ITEMS::value> sort_items{};
+
+    return doit_step(d_temp_storage,
+                     temp_storage_bytes,
+                     keys,
+                     items,
+                     keys_count,
+                     compare_op,
+                     stream,
+                     sort_items);
+  }
+
+  template <typename SORT_ITEMS,
+            typename STABLE,
+            typename Derived,
+            typename KeysIt,
+            typename ItemsIt,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  void merge_sort(execution_policy<Derived>& policy,
+                  KeysIt                     keys_first,
+                  KeysIt                     keys_last,
+                  ItemsIt                    items_first,
+                  CompareOp                  compare_op)
+
+  {
+    typedef typename iterator_traits<KeysIt>::difference_type size_type;
+
+    size_type count = static_cast<size_type>(thrust::distance(keys_first, keys_last));
+
+    size_t       storage_size = 0;
+    cudaStream_t stream       = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = doit_step<SORT_ITEMS, STABLE>(NULL,
+                                           storage_size,
+                                           keys_first,
+                                           items_first,
+                                           count,
+                                           compare_op,
+                                           stream);
+    cuda_cub::throw_on_error(status, "merge_sort: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = doit_step<SORT_ITEMS, STABLE>(ptr,
+                                           storage_size,
+                                           keys_first,
+                                           items_first,
+                                           count,
+                                           compare_op,
+                                           stream);
+    cuda_cub::throw_on_error(status, "merge_sort: failed on 2nd step");
+
+    status = cuda_cub::synchronize_optional(policy);
+    cuda_cub::throw_on_error(status, "merge_sort: failed to synchronize");
+  }
+}    // namespace __merge_sort
+
+namespace __radix_sort {
+
+  template <class SORT_ITEMS, class Comparator>
+  struct dispatch;
+
+  // sort keys in ascending order
+  template <class K>
+  struct dispatch<thrust::detail::false_type, thrust::less<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& /*items_buffer*/,
+         Size                     count,
+         cudaStream_t             stream)
+    {
+      return cub::DeviceRadixSort::SortKeys(d_temp_storage,
+                                            temp_storage_bytes,
+                                            keys_buffer,
+                                            static_cast<int>(count),
+                                            0,
+                                            static_cast<int>(sizeof(Key) * 8),
+                                            stream);
+    }
+  }; // struct dispatch -- sort keys in ascending order;
+
+  // sort keys in descending order
+  template <class K>
+  struct dispatch<thrust::detail::false_type, thrust::greater<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& /*items_buffer*/,
+         Size                     count,
+         cudaStream_t             stream)
+    {
+      return cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
+                                                      temp_storage_bytes,
+                                                      keys_buffer,
+                                                      static_cast<int>(count),
+                                                      0,
+                                                      static_cast<int>(sizeof(Key) * 8),
+                                                      stream);
+    }
+  }; // struct dispatch -- sort keys in descending order;
+
+  // sort pairs in ascending order
+  template <class K>
+  struct dispatch<thrust::detail::true_type, thrust::less<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& items_buffer,
+         Size                     count,
+         cudaStream_t             stream)
+    {
+      return cub::DeviceRadixSort::SortPairs(d_temp_storage,
+                                             temp_storage_bytes,
+                                             keys_buffer,
+                                             items_buffer,
+                                             static_cast<int>(count),
+                                             0,
+                                             static_cast<int>(sizeof(Key) * 8),
+                                             stream);
+    }
+  }; // struct dispatch -- sort pairs in ascending order;
+
+  // sort pairs in descending order
+  template <class K>
+  struct dispatch<thrust::detail::true_type, thrust::greater<K> >
+  {
+    template <class Key, class Item, class Size>
+    THRUST_RUNTIME_FUNCTION static cudaError_t
+    doit(void*                    d_temp_storage,
+         size_t&                  temp_storage_bytes,
+         cub::DoubleBuffer<Key>&  keys_buffer,
+         cub::DoubleBuffer<Item>& items_buffer,
+         Size                     count,
+         cudaStream_t             stream)
+    {
+      return cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
+                                                       temp_storage_bytes,
+                                                       keys_buffer,
+                                                       items_buffer,
+                                                       static_cast<int>(count),
+                                                       0,
+                                                       static_cast<int>(sizeof(Key) * 8),
+                                                       stream);
+    }
+  }; // struct dispatch -- sort pairs in descending order;
+
+  template <typename SORT_ITEMS,
+            typename Derived,
+            typename Key,
+            typename Item,
+            typename Size,
+            typename CompareOp>
+  THRUST_RUNTIME_FUNCTION
+  void radix_sort(execution_policy<Derived>& policy,
+                  Key*                       keys,
+                  Item*                      items,
+                  Size                       count,
+                  CompareOp)
+  {
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cub::DoubleBuffer<Key>  keys_buffer(keys, NULL);
+    cub::DoubleBuffer<Item> items_buffer(items, NULL);
+
+    Size keys_count = count;
+    Size items_count = SORT_ITEMS::value ? count : 0;
+
+    cudaError_t status;
+
+    status = dispatch<SORT_ITEMS, CompareOp>::doit(NULL,
+                                                   temp_storage_bytes,
+                                                   keys_buffer,
+                                                   items_buffer,
+                                                   keys_count,
+                                                   stream);
+    cuda_cub::throw_on_error(status, "radix_sort: failed on 1st step");
+
+    size_t keys_temp_storage  = core::align_to(sizeof(Key) * keys_count, 128);
+    size_t items_temp_storage = core::align_to(sizeof(Item) * items_count, 128);
+
+    size_t storage_size = keys_temp_storage
+                        + items_temp_storage
+                        + temp_storage_bytes;
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+
+    keys_buffer.d_buffers[1]  = thrust::detail::aligned_reinterpret_cast<Key*>(
+      tmp.data().get()
+    );
+    items_buffer.d_buffers[1] = thrust::detail::aligned_reinterpret_cast<Item*>(
+      tmp.data().get() + keys_temp_storage
+    );
+    void *ptr = static_cast<void*>(
+      tmp.data().get() + keys_temp_storage + items_temp_storage
+    );
+
+    status = dispatch<SORT_ITEMS, CompareOp>::doit(ptr,
+                                                   temp_storage_bytes,
+                                                   keys_buffer,
+                                                   items_buffer,
+                                                   keys_count,
+                                                   stream);
+    cuda_cub::throw_on_error(status, "radix_sort: failed on 2nd step");
+
+    if (keys_buffer.selector != 0)
+    {
+      Key* temp_ptr = reinterpret_cast<Key*>(keys_buffer.d_buffers[1]);
+      cuda_cub::copy_n(policy, temp_ptr, keys_count, keys);
+    }
+    THRUST_IF_CONSTEXPR(SORT_ITEMS::value)
+    {
+      if (items_buffer.selector != 0)
+      {
+        Item *temp_ptr = reinterpret_cast<Item *>(items_buffer.d_buffers[1]);
+        cuda_cub::copy_n(policy, temp_ptr, items_count, items);
+      }
+    }
+  }
+}    // __radix_sort
+
+//---------------------------------------------------------------------
+// Smart sort picks at compile-time whether to dispatch radix or merge sort
+//---------------------------------------------------------------------
+
+namespace __smart_sort {
+
+  template <class Key, class CompareOp>
+  struct can_use_primitive_sort
+      : thrust::detail::and_<
+            thrust::detail::is_arithmetic<Key>,
+            thrust::detail::or_<
+                thrust::detail::is_same<CompareOp, thrust::less<Key> >,
+                thrust::detail::is_same<CompareOp, thrust::greater<Key> > > > {};
+
+  template <class Iterator, class CompareOp>
+  struct enable_if_primitive_sort
+      : thrust::detail::enable_if<
+            can_use_primitive_sort<typename iterator_value<Iterator>::type,
+                                   CompareOp>::value> {};
+
+  template <class Iterator, class CompareOp>
+  struct enable_if_comparison_sort
+      : thrust::detail::disable_if<
+            can_use_primitive_sort<typename iterator_value<Iterator>::type,
+                                   CompareOp>::value> {};
+
+
+  template <class SORT_ITEMS,
+            class STABLE,
+            class Policy,
+            class KeysIt,
+            class ItemsIt,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION typename enable_if_comparison_sort<KeysIt, CompareOp>::type
+  smart_sort(Policy&   policy,
+             KeysIt    keys_first,
+             KeysIt    keys_last,
+             ItemsIt   items_first,
+             CompareOp compare_op)
+  {
+    __merge_sort::merge_sort<SORT_ITEMS, STABLE>(policy,
+                                                 keys_first,
+                                                 keys_last,
+                                                 items_first,
+                                                 compare_op);
+
+  }
+
+  template <class SORT_ITEMS,
+            class STABLE,
+            class Policy,
+            class KeysIt,
+            class ItemsIt,
+            class CompareOp>
+  THRUST_RUNTIME_FUNCTION typename enable_if_primitive_sort<KeysIt, CompareOp>::type
+  smart_sort(execution_policy<Policy>& policy,
+             KeysIt                    keys_first,
+             KeysIt                    keys_last,
+             ItemsIt                   items_first,
+             CompareOp                 compare_op)
+  {
+    // ensure sequences have trivial iterators
+    thrust::detail::trivial_sequence<KeysIt, Policy>
+        keys(policy, keys_first, keys_last);
+
+    if (SORT_ITEMS::value)
+    {
+      thrust::detail::trivial_sequence<ItemsIt, Policy>
+          values(policy, items_first, items_first + (keys_last - keys_first));
+
+      __radix_sort::radix_sort<SORT_ITEMS>(
+          policy,
+          thrust::raw_pointer_cast(&*keys.begin()),
+          thrust::raw_pointer_cast(&*values.begin()),
+          keys_last - keys_first,
+          compare_op);
+
+      if (!is_contiguous_iterator<ItemsIt>::value)
+      {
+        cuda_cub::copy(policy, values.begin(), values.end(), items_first);
+      }
+    }
+    else
+    {
+      __radix_sort::radix_sort<SORT_ITEMS>(
+          policy,
+          thrust::raw_pointer_cast(&*keys.begin()),
+          thrust::raw_pointer_cast(&*keys.begin()),
+          keys_last - keys_first,
+          compare_op);
+    }
+
+    // copy results back, if necessary
+    if (!is_contiguous_iterator<KeysIt>::value)
+    {
+      cuda_cub::copy(policy, keys.begin(), keys.end(), keys_first);
+    }
+
+    cuda_cub::throw_on_error(
+      cuda_cub::synchronize_optional(policy),
+      "smart_sort: failed to synchronize");
+  }
+}    // namespace __smart_sort
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived, class ItemsIt, class CompareOp>
+void __host__ __device__
+sort(execution_policy<Derived>& policy,
+     ItemsIt                    first,
+     ItemsIt                    last,
+     CompareOp                  compare_op)
 {
-namespace cuda
+  THRUST_CDP_DISPATCH(
+    (using item_t = thrust::iterator_value_t<ItemsIt>; item_t *null_ = nullptr;
+     __smart_sort::smart_sort<thrust::detail::false_type,
+                              thrust::detail::false_type>(policy,
+                                                          first,
+                                                          last,
+                                                          null_,
+                                                          compare_op);),
+    (thrust::sort(cvt_to_seq(derived_cast(policy)), first, last, compare_op);));
+}
+
+__thrust_exec_check_disable__
+template <class Derived, class ItemsIt, class CompareOp>
+void __host__ __device__
+stable_sort(execution_policy<Derived>& policy,
+            ItemsIt                    first,
+            ItemsIt                    last,
+            CompareOp                  compare_op)
 {
-namespace detail
+  THRUST_CDP_DISPATCH(
+    (using item_t = thrust::iterator_value_t<ItemsIt>; item_t *null_ = nullptr;
+     __smart_sort::smart_sort<thrust::detail::false_type,
+                              thrust::detail::true_type>(policy,
+                                                         first,
+                                                         last,
+                                                         null_,
+                                                         compare_op);),
+    (thrust::stable_sort(cvt_to_seq(derived_cast(policy)),
+                         first,
+                         last,
+                         compare_op);));
+}
+
+__thrust_exec_check_disable__
+template <class Derived, class KeysIt, class ValuesIt, class CompareOp>
+void __host__ __device__
+sort_by_key(execution_policy<Derived>& policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values,
+            CompareOp                  compare_op)
 {
+  THRUST_CDP_DISPATCH(
+    (__smart_sort::smart_sort<thrust::detail::true_type,
+                              thrust::detail::false_type>(policy,
+                                                          keys_first,
+                                                          keys_last,
+                                                          values,
+                                                          compare_op);),
+    (thrust::sort_by_key(cvt_to_seq(derived_cast(policy)),
+                         keys_first,
+                         keys_last,
+                         values,
+                         compare_op);));
+}
 
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeysIt,
+          class ValuesIt,
+          class CompareOp>
+void __host__ __device__
+stable_sort_by_key(execution_policy<Derived> &policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values,
+            CompareOp                  compare_op)
+{
+  THRUST_CDP_DISPATCH(
+    (__smart_sort::smart_sort<thrust::detail::true_type,
+                              thrust::detail::true_type>(policy,
+                                                         keys_first,
+                                                         keys_last,
+                                                         values,
+                                                         compare_op);),
+    (thrust::stable_sort_by_key(cvt_to_seq(derived_cast(policy)),
+                                keys_first,
+                                keys_last,
+                                values,
+                                compare_op);));
+}
 
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_sort(execution_policy<DerivedPolicy> &exec,
-                 RandomAccessIterator first,
-                 RandomAccessIterator last,
-                 StrictWeakOrdering comp);
+// API with default comparator
 
+template <class Derived, class ItemsIt>
+void __host__ __device__
+sort(execution_policy<Derived>& policy,
+     ItemsIt                    first,
+     ItemsIt                    last)
+{
+  typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+  cuda_cub::sort(policy, first, last, less<item_type>());
+}
 
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                        RandomAccessIterator1 keys_first,
-                        RandomAccessIterator1 keys_last,
-                        RandomAccessIterator2 values_first,
-                        StrictWeakOrdering comp);
+template <class Derived, class ItemsIt>
+void __host__ __device__
+stable_sort(execution_policy<Derived>& policy,
+            ItemsIt                    first,
+            ItemsIt                    last)
+{
+  typedef typename thrust::iterator_value<ItemsIt>::type item_type;
+  cuda_cub::stable_sort(policy, first, last, less<item_type>());
+}
 
+template <class Derived, class KeysIt, class ValuesIt>
+void __host__ __device__
+sort_by_key(execution_policy<Derived>& policy,
+            KeysIt                     keys_first,
+            KeysIt                     keys_last,
+            ValuesIt                   values)
+{
+  typedef typename thrust::iterator_value<KeysIt>::type key_type;
+  cuda_cub::sort_by_key(policy, keys_first, keys_last, values, less<key_type>());
+}
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+template <class Derived, class KeysIt, class ValuesIt>
+void __host__ __device__
+stable_sort_by_key(
+    execution_policy<Derived>& policy, KeysIt keys_first, KeysIt keys_last, ValuesIt values)
+{
+  typedef typename thrust::iterator_value<KeysIt>::type key_type;
+  cuda_cub::stable_sort_by_key(policy, keys_first, keys_last, values, less<key_type>());
+}
 
-#include <thrust/system/cuda/detail/sort.inl>
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/sort.inl b/thrust/system/cuda/detail/sort.inl
deleted file mode 100644
index 0aff7beb4..000000000
--- a/thrust/system/cuda/detail/sort.inl
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-
-/*! \file sort.inl
- *  \brief Inline file for sort.h
- */
-
-#include <thrust/system/cuda/detail/detail/stable_merge_sort.h>
-#include <thrust/system/cuda/detail/detail/stable_primitive_sort.h>
-
-#include <thrust/reverse.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-#include <thrust/detail/trivial_sequence.h>
-#include <thrust/detail/copy.h>
-#include <thrust/detail/seq.h>
-#include <thrust/sort.h>
-#include <thrust/system/cuda/detail/bulk.h>
-
-
-/*
- *  This file implements the following dispatch procedure for cuda::stable_sort()
- *  and cuda::stable_sort_by_key(). The first level inspects the KeyType
- *  and StrictWeakOrdering to determine whether a sort assuming primitive-typed
- *  data may be applied.
- *
- *  If a sort assuming primitive-typed data can be applied (i.e., a radix sort),
- *  the input ranges are first trivialized (turned into simple contiguous ranges
- *  if they are not already). To implement descending orderings, an ascending
- *  sort will be reversed.
- *
- *  If a sort assuming primitive-typed data cannot be applied, a comparison-based
- *  sort is used. Depending on the size of the key and value types, one level of
- *  indirection may be applied to their input ranges. This transformation
- *  may be applied to either range to convert an ill-suited problem (i.e. sorting with
- *  large keys or large value) into a problem more amenable to the underlying
- *  merge sort algorithm.
- */
-
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace stable_sort_detail
-{
-
-
-template<typename KeyType, typename StrictWeakCompare>
-  struct can_use_primitive_sort
-    : thrust::detail::and_<
-        thrust::detail::is_arithmetic<KeyType>,
-        thrust::detail::or_<
-          thrust::detail::is_same<StrictWeakCompare,thrust::less<KeyType> >,
-          thrust::detail::is_same<StrictWeakCompare,thrust::greater<KeyType> >
-        >
-      >
-{};
-
-
-template<typename RandomAccessIterator, typename StrictWeakCompare>
-  struct enable_if_primitive_sort
-    : thrust::detail::enable_if<
-        can_use_primitive_sort<
-          typename iterator_value<RandomAccessIterator>::type,
-          StrictWeakCompare
-        >::value
-      >
-{};
-
-
-template<typename RandomAccessIterator, typename StrictWeakCompare>
-  struct enable_if_comparison_sort
-    : thrust::detail::disable_if<
-        can_use_primitive_sort<
-          typename iterator_value<RandomAccessIterator>::type,
-          StrictWeakCompare
-        >::value
-      >
-{};
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename StrictWeakOrdering>
-__host__ __device__
-typename enable_if_primitive_sort<RandomAccessIterator,StrictWeakOrdering>::type
-  stable_sort(execution_policy<DerivedPolicy> &exec,
-              RandomAccessIterator first,
-              RandomAccessIterator last,
-              StrictWeakOrdering comp)
-{
-  // ensure sequence has trivial iterators
-  thrust::detail::trivial_sequence<RandomAccessIterator,DerivedPolicy> keys(exec, first, last);
-
-  thrust::system::cuda::detail::detail::stable_primitive_sort(exec, keys.begin(), keys.end(), comp);
-  
-  // copy results back, if necessary
-  if(!thrust::detail::is_trivial_iterator<RandomAccessIterator>::value)
-  {
-    thrust::copy(exec, keys.begin(), keys.end(), first);
-  }
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename StrictWeakOrdering>
-__host__ __device__
-typename enable_if_comparison_sort<RandomAccessIterator,StrictWeakOrdering>::type
-  stable_sort(execution_policy<DerivedPolicy> &exec,
-              RandomAccessIterator first,
-              RandomAccessIterator last,
-              StrictWeakOrdering comp)
-{
-  thrust::system::cuda::detail::detail::stable_merge_sort(exec, first, last, comp);
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__host__ __device__
-typename enable_if_primitive_sort<RandomAccessIterator1,StrictWeakOrdering>::type
-  stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                     RandomAccessIterator1 keys_first,
-                     RandomAccessIterator1 keys_last,
-                     RandomAccessIterator2 values_first,
-                     StrictWeakOrdering comp)
-{
-  // ensure sequences have trivial iterators
-  thrust::detail::trivial_sequence<RandomAccessIterator1,DerivedPolicy> keys(exec, keys_first, keys_last);
-  thrust::detail::trivial_sequence<RandomAccessIterator2,DerivedPolicy> values(exec, values_first, values_first + (keys_last - keys_first));
-  
-  thrust::system::cuda::detail::detail::stable_primitive_sort_by_key(exec, keys.begin(), keys.end(), values.begin(), comp);
-  
-  // copy results back, if necessary
-  if(!thrust::detail::is_trivial_iterator<RandomAccessIterator1>::value)
-  {
-    thrust::copy(exec, keys.begin(), keys.end(), keys_first);
-  }
-
-  if(!thrust::detail::is_trivial_iterator<RandomAccessIterator2>::value)
-  {
-    thrust::copy(exec, values.begin(), values.end(), values_first);
-  }
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__host__ __device__
-typename enable_if_comparison_sort<RandomAccessIterator1,StrictWeakOrdering>::type
-  stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                     RandomAccessIterator1 keys_first,
-                     RandomAccessIterator1 keys_last,
-                     RandomAccessIterator2 values_first,
-                     StrictWeakOrdering comp)
-{
-  thrust::system::cuda::detail::detail::stable_merge_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-}
-
-
-} // end namespace stable_sort_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_sort(execution_policy<DerivedPolicy> &exec,
-                 RandomAccessIterator first,
-                 RandomAccessIterator last,
-                 StrictWeakOrdering comp)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static void parallel_path(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator first,
-                              RandomAccessIterator last,
-                              StrictWeakOrdering comp)
-    {
-      stable_sort_detail::stable_sort(exec, first, last, comp);
-    }
-
-    __host__ __device__
-    static void sequential_path(RandomAccessIterator first,
-                                RandomAccessIterator last,
-                                StrictWeakOrdering comp)
-    {
-      thrust::sort(thrust::seq, first, last, comp);
-    }
-  };
-
-#if __BULK_HAS_CUDART__
-  workaround::parallel_path(exec, first, last, comp);
-#else
-  workaround::sequential_path(first, last, comp);
-#endif
-}
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename RandomAccessIterator2,
-         typename StrictWeakOrdering>
-__host__ __device__
-void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
-                        RandomAccessIterator1 keys_first,
-                        RandomAccessIterator1 keys_last,
-                        RandomAccessIterator2 values_first,
-                        StrictWeakOrdering comp)
-{
-  // we're attempting to launch a kernel, assert we're compiling with nvcc
-  // ========================================================================
-  // X Note to the user: If you've found this line due to a compiler error, X
-  // X you need to compile your code using nvcc, rather than g++ or cl.exe  X
-  // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator1, THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC>::value) );
-
-  struct workaround
-  {
-    __host__ __device__
-    static void parallel_path(execution_policy<DerivedPolicy> &exec,
-                              RandomAccessIterator1 keys_first,
-                              RandomAccessIterator1 keys_last,
-                              RandomAccessIterator2 values_first,
-                              StrictWeakOrdering comp)
-    {
-      stable_sort_detail::stable_sort_by_key(exec, keys_first, keys_last, values_first, comp);
-    }
-
-    __host__ __device__
-    static void sequential_path(RandomAccessIterator1 keys_first,
-                                RandomAccessIterator1 keys_last,
-                                RandomAccessIterator2 values_first,
-                                StrictWeakOrdering comp)
-    {
-      thrust::stable_sort_by_key(thrust::seq, keys_first, keys_last, values_first, comp);
-    }
-  };
-  
-#if __BULK_HAS_CUDART__
-  workaround::parallel_path(exec, keys_first, keys_last, values_first, comp);
-#else
-  workaround::sequential_path(keys_first, keys_last, values_first, comp);
-#endif
-}
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/swap_ranges.h b/thrust/system/cuda/detail/swap_ranges.h
index c63bb0320..8f9e4fa8a 100644
--- a/thrust/system/cuda/detail/swap_ranges.h
+++ b/thrust/system/cuda/detail/swap_ranges.h
@@ -1,22 +1,102 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// cuda has no special swap_ranges
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/swap.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+namespace __swap_ranges {
+
+
+  template <class ItemsIt1, class ItemsIt2>
+  struct swap_f
+  {
+    ItemsIt1 items1;
+    ItemsIt2 items2;
+
+    typedef  typename iterator_traits<ItemsIt1>::value_type value1_type;
+    typedef  typename iterator_traits<ItemsIt2>::value_type value2_type;
+
+    THRUST_FUNCTION
+    swap_f(ItemsIt1 items1_, ItemsIt2 items2_)
+        : items1(items1_), items2(items2_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      value1_type item1 = items1[idx];
+      value2_type item2 = items2[idx];
+      // XXX thrust::swap is buggy
+      // if reference_type of ItemIt1/ItemsIt2
+      // is a proxy reference, then KABOOM!
+      // to avoid this, just copy the value first before swap
+      // *todo* specialize on real & proxy references
+      using thrust::swap;
+      swap(item1, item2);
+      items1[idx] = item1;
+      items2[idx] = item2;
+    }
+  };
+}    // namespace __swap_ranges
+
+template <class Derived,
+          class ItemsIt1,
+          class ItemsIt2>
+ItemsIt2 __host__ __device__
+swap_ranges(execution_policy<Derived> &policy,
+            ItemsIt1                   first1,
+            ItemsIt1                   last1,
+            ItemsIt2                   first2)
+{
+  typedef typename iterator_traits<ItemsIt1>::difference_type size_type;
+
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+
+  cuda_cub::parallel_for(policy,
+                         __swap_ranges::swap_f<ItemsIt1,
+                                               ItemsIt2>(first1, first2),
+                         num_items);
+
+  return first2 + num_items;
+}
+
+
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/synchronize.inl b/thrust/system/cuda/detail/synchronize.inl
deleted file mode 100644
index 2e2fbfb87..000000000
--- a/thrust/system/cuda/detail/synchronize.inl
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/synchronize.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <thrust/system/cuda/detail/throw_on_error.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-inline __host__ __device__
-void synchronize(const char *message)
-{
-  throw_on_error(cudaDeviceSynchronize(), message);
-} // end synchronize()
-
-
-inline __host__ __device__
-void synchronize(cudaStream_t stream, const char *message)
-{
-#if !defined(__CUDA_ARCH__)
-  throw_on_error(cudaStreamSynchronize(stream), message);
-#else
-  synchronize(message);
-#endif
-}
-
-inline __host__ __device__
-void synchronize_if_enabled(const char *message)
-{
-// XXX this could potentially be a runtime decision
-//     note we always have to synchronize in __device__ code
-#if __THRUST_SYNCHRONOUS || defined(__CUDA_ARCH__)
-  synchronize(message);
-#else
-  // WAR "unused parameter" warning
-  (void) message;
-#endif
-} // end synchronize_if_enabled()
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/tabulate.h b/thrust/system/cuda/detail/tabulate.h
index c6ae90664..67edb8574 100644
--- a/thrust/system/cuda/detail/tabulate.h
+++ b/thrust/system/cuda/detail/tabulate.h
@@ -1,22 +1,83 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/distance.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+namespace __tabulate {
+
+  template <class Iterator, class TabulateOp, class Size>
+  struct functor
+  {
+    Iterator items;
+    TabulateOp op;
+
+    __host__ __device__
+    functor(Iterator items_, TabulateOp op_)
+        : items(items_), op(op_) {}
+
+    void __device__ operator()(Size idx)
+    {
+      items[idx] = op(idx);
+    }
+  };    // struct functor
+
+}    // namespace __tabulate
+
+template <class Derived,
+          class Iterator,
+          class TabulateOp>
+void __host__ __device__
+tabulate(execution_policy<Derived>& policy,
+         Iterator                   first,
+         Iterator                   last,
+         TabulateOp                 tabulate_op)
+{
+  typedef typename iterator_traits<Iterator>::difference_type size_type;
+
+  size_type count = thrust::distance(first, last);
+
+  typedef __tabulate::functor<Iterator, TabulateOp, size_type> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, tabulate_op),
+                         count);
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/temporary_buffer.h b/thrust/system/cuda/detail/temporary_buffer.h
index 2adfaf281..6b5276141 100644
--- a/thrust/system/cuda/detail/temporary_buffer.h
+++ b/thrust/system/cuda/detail/temporary_buffer.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2016 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
diff --git a/thrust/system/cuda/detail/temporary_indirect_permutation.h b/thrust/system/cuda/detail/temporary_indirect_permutation.h
deleted file mode 100644
index 94137d858..000000000
--- a/thrust/system/cuda/detail/temporary_indirect_permutation.h
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/temporary_array.h>
-#include <thrust/sequence.h>
-#include <thrust/gather.h>
-#include <thrust/detail/function.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator>
-  struct temporary_indirect_permutation
-{
-  private:
-    typedef unsigned int size_type;
-    typedef thrust::detail::temporary_array<size_type, DerivedPolicy> array_type;
-
-  public:
-    __host__ __device__
-    temporary_indirect_permutation(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last)
-      : m_exec(derived_cast(exec)),
-        m_src_first(first),
-        m_src_last(last),
-        m_permutation(0, m_exec, last - first)
-    {
-      // generate sorted index sequence
-      thrust::sequence(exec, m_permutation.begin(), m_permutation.end());
-    }
-
-    __host__ __device__
-    ~temporary_indirect_permutation()
-    {
-      // permute the source array using the indices
-      typedef typename thrust::iterator_value<RandomAccessIterator>::type value_type;
-      thrust::detail::temporary_array<value_type, DerivedPolicy> temp(m_exec, m_src_first, m_src_last);
-      thrust::gather(m_exec, m_permutation.begin(), m_permutation.end(), temp.begin(), m_src_first);
-    }
-
-    typedef typename array_type::iterator iterator;
-
-    __host__ __device__
-    iterator begin()
-    {
-      return m_permutation.begin();
-    }
-
-    __host__ __device__
-    iterator end()
-    {
-      return m_permutation.end();
-    }
-
-  private:
-    DerivedPolicy &m_exec;
-    RandomAccessIterator m_src_first, m_src_last;
-    thrust::detail::temporary_array<size_type, DerivedPolicy> m_permutation;
-};
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator>
-  struct iterator_range_with_execution_policy
-{
-  __host__ __device__
-  iterator_range_with_execution_policy(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last)
-    : m_exec(derived_cast(exec)), m_first(first), m_last(last)
-  {}
-
-  typedef RandomAccessIterator iterator;
-
-  __host__ __device__
-  iterator begin()
-  {
-    return m_first;
-  }
-
-  __host__ __device__
-  iterator end()
-  {
-    return m_last;
-  }
-
-  __host__ __device__
-  DerivedPolicy &exec()
-  {
-    return m_exec;
-  }
-
-  DerivedPolicy &m_exec;
-  RandomAccessIterator m_first, m_last;
-};
-
-
-template<typename Condition, typename DerivedPolicy, typename RandomAccessIterator>
-  struct conditional_temporary_indirect_permutation
-    : thrust::detail::eval_if<
-        Condition::value,
-        thrust::detail::identity_<temporary_indirect_permutation<DerivedPolicy, RandomAccessIterator> >,
-        thrust::detail::identity_<iterator_range_with_execution_policy<DerivedPolicy, RandomAccessIterator> >
-      >::type
-{
-  typedef typename thrust::detail::eval_if<
-    Condition::value,
-    thrust::detail::identity_<temporary_indirect_permutation<DerivedPolicy, RandomAccessIterator> >,
-    thrust::detail::identity_<iterator_range_with_execution_policy<DerivedPolicy, RandomAccessIterator> >
-  >::type super_t;
-
-  __host__ __device__
-  conditional_temporary_indirect_permutation(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last)
-    : super_t(exec, first, last)
-  {}
-};
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator, typename Compare>
-  struct temporary_indirect_ordering
-    : temporary_indirect_permutation<DerivedPolicy,RandomAccessIterator>
-{
-  private:
-    typedef temporary_indirect_permutation<DerivedPolicy,RandomAccessIterator> super_t;
-
-  public:
-    __host__ __device__
-    temporary_indirect_ordering(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp)
-      : super_t(exec, first, last),
-        m_comp(first, comp)
-    {}
-
-    struct compare
-    {
-      RandomAccessIterator first;
-
-      thrust::detail::wrapped_function<
-        Compare,
-        bool
-      > comp;
-
-      __host__ __device__
-      compare(RandomAccessIterator first, Compare comp)
-        : first(first), comp(comp)
-      {}
-
-      template<typename Integral>
-      __host__ __device__
-      bool operator()(Integral a, Integral b)
-      {
-        return comp(first[a], first[b]);
-      }
-    };
-
-    __host__ __device__
-    compare comp() const
-    {
-      return m_comp;
-    }
-
-  private:
-    compare m_comp;
-};
-
-
-template<typename DerivedPolicy, typename RandomAccessIterator, typename Compare>
-  struct iterator_range_with_execution_policy_and_compare
-    : iterator_range_with_execution_policy<DerivedPolicy, RandomAccessIterator>
-{
-  typedef iterator_range_with_execution_policy<DerivedPolicy, RandomAccessIterator> super_t;
-
-  __host__ __device__
-  iterator_range_with_execution_policy_and_compare(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp)
-    : super_t(exec, first, last), m_comp(comp)
-  {}
-
-  typedef Compare compare;
-
-  __host__ __device__
-  compare comp()
-  {
-    return m_comp;
-  }
-
-  Compare m_comp;
-};
-
-
-template<typename Condition, typename DerivedPolicy, typename RandomAccessIterator, typename Compare>
-  struct conditional_temporary_indirect_ordering
-    : thrust::detail::eval_if<
-        Condition::value,
-        thrust::detail::identity_<temporary_indirect_ordering<DerivedPolicy, RandomAccessIterator, Compare> >,
-        thrust::detail::identity_<iterator_range_with_execution_policy_and_compare<DerivedPolicy, RandomAccessIterator, Compare> >
-      >::type
-{
-  typedef typename thrust::detail::eval_if<
-    Condition::value,
-    thrust::detail::identity_<temporary_indirect_ordering<DerivedPolicy, RandomAccessIterator, Compare> >,
-    thrust::detail::identity_<iterator_range_with_execution_policy_and_compare<DerivedPolicy, RandomAccessIterator, Compare> >
-  >::type super_t;
-
-  __host__ __device__
-  conditional_temporary_indirect_ordering(thrust::execution_policy<DerivedPolicy> &exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp)
-    : super_t(exec, first, last, comp)
-  {}
-};
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/detail/terminate.h b/thrust/system/cuda/detail/terminate.h
index d9d657817..226c9d5ac 100644
--- a/thrust/system/cuda/detail/terminate.h
+++ b/thrust/system/cuda/detail/terminate.h
@@ -1,26 +1,37 @@
-/*
- *  Copyright 2008-2012 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
+ ******************************************************************************/
 
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/bulk.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <cstdio>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace cuda
@@ -32,19 +43,20 @@ namespace detail
 inline __device__
 void terminate()
 {
-  thrust::system::cuda::detail::bulk_::detail::terminate();
+  thrust::cuda_cub::terminate();
 }
 
 
-__host__ __device__
-inline void terminate_with_message(const char* message)
+inline __host__ __device__
+void terminate_with_message(const char* message)
 {
-  thrust::system::cuda::detail::bulk_::detail::terminate_with_message(message);
+  printf("%s\n", message);
+  thrust::cuda_cub::terminate();
 }
 
 
 } // end detail
 } // end cuda
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/transform.h b/thrust/system/cuda/detail/transform.h
index 39e224e09..3cf171a47 100644
--- a/thrust/system/cuda/detail/transform.h
+++ b/thrust/system/cuda/detail/transform.h
@@ -1,22 +1,416 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// cuda has no special transform
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <thrust/system/cuda/config.h>
+
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/detail/type_traits/result_of_adaptable_function.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+
+namespace __transform {
+
+  struct no_stencil_tag
+  {
+  };
+
+  struct always_true_predicate
+  {
+    template <class T>
+    bool THRUST_DEVICE_FUNCTION operator()(T const &) const
+    {
+      return true;
+    }
+  };
+
+  template <class InputIt,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  struct unary_transform_f
+  {
+    InputIt     input;
+    OutputIt    output;
+    StencilIt   stencil;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    unary_transform_f(InputIt     input_,
+                      OutputIt    output_,
+                      StencilIt   stencil_,
+                      TransformOp op_,
+                      Predicate   pred_)
+        : input(input_),
+          output(output_),
+          stencil(stencil_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(stencil[idx])))
+        output[idx] = op(raw_reference_cast(input[idx]));
+    }
+  }; // struct unary_transform_stencil_f
+
+  template <class InputIt,
+            class OutputIt,
+            class TransformOp,
+            class Predicate>
+  struct unary_transform_f<InputIt,
+                           OutputIt,
+                           no_stencil_tag,
+                           TransformOp,
+                           Predicate>
+  {
+    InputIt     input;
+    OutputIt    output;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    unary_transform_f(InputIt        input_,
+                      OutputIt       output_,
+                      no_stencil_tag,
+                      TransformOp    op_,
+                      Predicate      pred_)
+        : input(input_), output(output_), op(op_), pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(input[idx])))
+        output[idx] = op(raw_reference_cast(input[idx]));
+    }
+  }; // struct unary_transform_f
+
+  template <class InputIt1,
+            class InputIt2,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  struct binary_transform_f
+  {
+    InputIt1    input1;
+    InputIt2    input2;
+    OutputIt    output;
+    StencilIt   stencil;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    binary_transform_f(InputIt1    input1_,
+                       InputIt2    input2_,
+                       OutputIt    output_,
+                       StencilIt   stencil_,
+                       TransformOp op_,
+                       Predicate   pred_)
+        : input1(input1_),
+          input2(input2_),
+          output(output_),
+          stencil(stencil_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(stencil[idx])))
+        output[idx] = op(raw_reference_cast(input1[idx]),
+                         raw_reference_cast(input2[idx]));
+    }
+  }; // struct binary_transform_stencil_f
+
+  template <class InputIt1,
+            class InputIt2,
+            class OutputIt,
+            class TransformOp,
+            class Predicate>
+  struct binary_transform_f<InputIt1,
+                            InputIt2,
+                            OutputIt,
+                            no_stencil_tag,
+                            TransformOp,
+                            Predicate>
+  {
+    InputIt1    input1;
+    InputIt2    input2;
+    OutputIt    output;
+    TransformOp op;
+    Predicate   pred;
+
+    THRUST_FUNCTION
+    binary_transform_f(InputIt1       input1_,
+                       InputIt2       input2_,
+                       OutputIt       output_,
+                       no_stencil_tag ,
+                       TransformOp    op_,
+                       Predicate      pred_)
+        : input1(input1_),
+          input2(input2_),
+          output(output_),
+          op(op_),
+          pred(pred_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      if (pred(raw_reference_cast(input1[idx])))
+        output[idx] = op(raw_reference_cast(input1[idx]),
+                         raw_reference_cast(input2[idx]));
+    }
+  }; // struct binary_transform_f
+
+  template <class Policy,
+            class InputIt,
+            class Size,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  OutputIt THRUST_FUNCTION
+  unary(Policy &     policy,
+        InputIt      items,
+        OutputIt     result,
+        Size         num_items,
+        StencilIt    stencil,
+        TransformOp  transform_op,
+        Predicate    predicate)
+  {
+    if (num_items == 0)
+      return result;
+
+    typedef unary_transform_f<InputIt,
+                              OutputIt,
+                              StencilIt,
+                              TransformOp,
+                              Predicate>
+        unary_transform_t;
+
+    cuda_cub::parallel_for(policy,
+                           unary_transform_t(items,
+                                             result,
+                                             stencil,
+                                             transform_op,
+                                             predicate),
+                           num_items);
+
+    return result + num_items;
+  }
+
+  template <class Policy,
+            class InputIt1,
+            class InputIt2,
+            class Size,
+            class OutputIt,
+            class StencilIt,
+            class TransformOp,
+            class Predicate>
+  OutputIt THRUST_FUNCTION
+  binary(Policy &    policy,
+         InputIt1    items1,
+         InputIt2    items2,
+         OutputIt    result,
+         Size        num_items,
+         StencilIt   stencil,
+         TransformOp transform_op,
+         Predicate   predicate)
+  {
+    if (num_items == 0)
+      return result;
+
+    typedef binary_transform_f<InputIt1,
+                               InputIt2,
+                               OutputIt,
+                               StencilIt,
+                               TransformOp,
+                               Predicate>
+        binary_transform_t;
+
+    cuda_cub::parallel_for(policy,
+                           binary_transform_t(items1,
+                                              items2,
+                                              result,
+                                              stencil,
+                                              transform_op,
+                                              predicate),
+                           num_items);
+
+    return result + num_items;
+  }
+
+}    // namespace __transform
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+//-------------------------
+//  one input data stream
+//-------------------------
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class StencilInputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             StencilInputIt             stencil,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  return __transform::unary(policy,
+                            first,
+                            result,
+                            num_items,
+                            stencil,
+                            transform_op,
+                            predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt                    first,
+             InputIt                    last,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  return cuda_cub::transform_if(policy,
+                                first,
+                                last,
+                                __transform::no_stencil_tag(),
+                                result,
+                                transform_op,
+                                predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp>
+OutputIt THRUST_FUNCTION
+transform(execution_policy<Derived> &policy,
+          InputIt                    first,
+          InputIt                    last,
+          OutputIt                   result,
+          TransformOp                transform_op)
+{
+  return cuda_cub::transform_if(policy,
+                                first,
+                                last,
+                                result,
+                                transform_op,
+                                __transform::always_true_predicate());
+} // func transform
+
+//-------------------------
+// two input data streams
+//-------------------------
+
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class StencilInputIt,
+          class OutputIt,
+          class TransformOp,
+          class Predicate>
+OutputIt THRUST_FUNCTION
+transform_if(execution_policy<Derived> &policy,
+             InputIt1                   first1,
+             InputIt1                   last1,
+             InputIt2                   first2,
+             StencilInputIt             stencil,
+             OutputIt                   result,
+             TransformOp                transform_op,
+             Predicate                  predicate)
+{
+  typedef typename iterator_traits<InputIt1>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first1, last1));
+  return __transform::binary(policy,
+                             first1,
+                             first2,
+                             result,
+                             num_items,
+                             stencil,
+                             transform_op,
+                             predicate);
+}    // func transform_if
+
+template <class Derived,
+          class InputIt1,
+          class InputIt2,
+          class OutputIt,
+          class TransformOp>
+OutputIt THRUST_FUNCTION
+transform(execution_policy<Derived> &policy,
+          InputIt1                   first1,
+          InputIt1                   last1,
+          InputIt2                   first2,
+          OutputIt                   result,
+          TransformOp                transform_op)
+{
+  return cuda_cub::transform_if(policy,
+                                first1,
+                                last1,
+                                first2,
+                                __transform::no_stencil_tag(),
+                                result,
+                                transform_op,
+                                __transform::always_true_predicate());
+} // func transform
+
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/transform_reduce.h b/thrust/system/cuda/detail/transform_reduce.h
index c6ae90664..60efaae5a 100644
--- a/thrust/system/cuda/detail/transform_reduce.h
+++ b/thrust/system/cuda/detail/transform_reduce.h
@@ -1,22 +1,68 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/distance.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class TransformOp,
+          class T,
+          class ReduceOp>
+T __host__ __device__
+transform_reduce(execution_policy<Derived> &policy,
+                 InputIt                    first,
+                 InputIt                    last,
+                 TransformOp                transform_op,
+                 T                          init,
+                 ReduceOp                   reduce_op)
+{
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<T,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::reduce_n(policy,
+                            transformed_iterator_t(first, transform_op),
+                            num_items,
+                            init,
+                            reduce_op);
+}
 
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/transform_scan.h b/thrust/system/cuda/detail/transform_scan.h
index c6ae90664..8f14ca8f7 100644
--- a/thrust/system/cuda/detail/transform_scan.h
+++ b/thrust/system/cuda/detail/transform_scan.h
@@ -1,22 +1,109 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/detail/type_traits.h>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/scan.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class ScanOp>
+OutputIt __host__ __device__
+transform_inclusive_scan(execution_policy<Derived> &policy,
+                         InputIt                    first,
+                         InputIt                    last,
+                         OutputIt                   result,
+                         TransformOp                transform_op,
+                         ScanOp                     scan_op)
+{
+  // Use the transformed input iterator's value type per https://wg21.link/P0571
+  using input_type = typename thrust::iterator_value<InputIt>::type;
+  using result_type = thrust::detail::invoke_result_t<TransformOp, input_type>;
+  using value_type = thrust::remove_cvref_t<result_type>;
+
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<value_type,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::inclusive_scan_n(policy,
+                                 transformed_iterator_t(first, transform_op),
+                                 num_items,
+                                 result,
+                                 scan_op);
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class TransformOp,
+          class InitialValueType,
+          class ScanOp>
+OutputIt __host__ __device__
+transform_exclusive_scan(execution_policy<Derived> &policy,
+                         InputIt                    first,
+                         InputIt                    last,
+                         OutputIt                   result,
+                         TransformOp                transform_op,
+                         InitialValueType           init,
+                         ScanOp                     scan_op)
+{
+  // Use the initial value type per https://wg21.link/P0571
+  using result_type = thrust::remove_cvref_t<InitialValueType>;
+
+  typedef typename iterator_traits<InputIt>::difference_type size_type;
+  size_type num_items = static_cast<size_type>(thrust::distance(first, last));
+  typedef transform_input_iterator_t<result_type,
+                                     InputIt,
+                                     TransformOp>
+      transformed_iterator_t;
+
+  return cuda_cub::exclusive_scan_n(policy,
+                                 transformed_iterator_t(first, transform_op),
+                                 num_items,
+                                 result,
+                                 init,
+                                 scan_op);
+}
+
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/trivial_copy.h b/thrust/system/cuda/detail/trivial_copy.h
deleted file mode 100644
index dea37ba39..000000000
--- a/thrust/system/cuda/detail/trivial_copy.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/execution_policy.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__host__ __device__
-void trivial_copy_n(execution_policy<DerivedPolicy> &exec,
-                    RandomAccessIterator1 first,
-                    Size n,
-                    RandomAccessIterator2 result);
-
-
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-void trivial_copy_n(cross_system<System1,System2> &exec,
-                    RandomAccessIterator1 first,
-                    Size n,
-                    RandomAccessIterator2 result);
-
-
-} // end detail
-} // end cuda
-} // end system
-} // end thrust
-
-#include <thrust/system/cuda/detail/trivial_copy.inl>
-
diff --git a/thrust/system/cuda/detail/trivial_copy.inl b/thrust/system/cuda/detail/trivial_copy.inl
deleted file mode 100644
index 10a1cecb9..000000000
--- a/thrust/system/cuda/detail/trivial_copy.inl
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/trivial_copy.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <thrust/system_error.h>
-#include <thrust/system/cuda/error.h>
-#include <thrust/system/cuda/detail/throw_on_error.h>
-#include <thrust/system/cuda/detail/synchronize.h>
-#include <thrust/iterator/iterator_traits.h>
-#include <thrust/system/cpp/detail/execution_policy.h>
-#include <thrust/detail/raw_pointer_cast.h>
-#include <thrust/functional.h>
-#include <thrust/system/cuda/detail/execute_on_stream.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
-namespace trivial_copy_detail
-{
-
-inline void checked_cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream)
-{
-  cudaError_t error = cudaMemcpyAsync(dst,src,count,kind,stream);
-  if(error)
-  {
-    throw thrust::system_error(error, thrust::cuda_category());
-  } // end error
-} // end checked_cudaMemcpy()
-
-
-template<typename System1,
-         typename System2>
-cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy<System1> &,
-                                const thrust::cpp::execution_policy<System2> &)
-{
-  return cudaMemcpyDeviceToHost;
-} // end cuda_memcpy_kind()
-
-
-template<typename System1,
-         typename System2>
-cudaMemcpyKind cuda_memcpy_kind(const thrust::cpp::execution_policy<System1> &,
-                                const thrust::cuda::execution_policy<System2> &)
-{
-  return cudaMemcpyHostToDevice;
-} // end cuda_memcpy_kind()
-
-template<typename System>
-cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy<System> &,
-                                const thrust::cuda::execution_policy<System> &)
-{
-#if defined(_WIN32) && !defined(_WIN64)
-  // On Win32 we assume cudaMemcpyDeviceToDevice on copy with cuda::par
-  // and raw pointers. This is the only legal option in Win32 with cuda::par policy.
-  return cudaMemcpyDeviceToDevice;
-#else
-  // In 64-bit mode copy with cuda::par can legally accept both host and device raw pointers
-  // the memcopy kind will be decided by the CUDA runtime based on UVA space of the pointer.
-  return cudaMemcpyDefault;
-#endif
-} // end cuda_memcpy_kind()
-
-template<typename System1,
-         typename System2>
-cudaStream_t cuda_memcpy_stream(const thrust::cuda::execution_policy<System1> &exec,
-                                const thrust::cpp::execution_policy<System2> &)
-{
-  return stream(derived_cast(exec));
-} // end cuda_memcpy_stream()
-
-template<typename System1,
-         typename System2>
-cudaStream_t cuda_memcpy_stream(const thrust::cpp::execution_policy<System1> &,
-                                const thrust::cuda::execution_policy<System2> &exec)
-{
-  return stream(derived_cast(exec));
-} // end cuda_memcpy_stream()
-
-
-template<typename System>
-cudaStream_t cuda_memcpy_stream(const thrust::cuda::execution_policy<System> &,
-                                const thrust::cuda::execution_policy<System> &exec)
-{
-  return stream(derived_cast(exec));
-} // end cuda_memcpy_stream()
-
-
-
-
-
-} // end namespace trivial_copy_detail
-
-
-template<typename DerivedPolicy,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-__host__ __device__
-void trivial_copy_n(execution_policy<DerivedPolicy> &exec,
-                    RandomAccessIterator1 first,
-                    Size n,
-                    RandomAccessIterator2 result)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;
-
-#ifndef __CUDA_ARCH__
-  void *dst = thrust::raw_pointer_cast(&*result);
-  const void *src = thrust::raw_pointer_cast(&*first);
-
-  // since the user may have given thrust::cuda::par to thrust::copy explicitly,
-  // this copy may be a cross-space copy that has bypassed system dispatch
-  // we need to have cudaMemcpyAsync figure out the directionality of the copy dynamically
-  // using cudaMemcpyDefault
-
-  cudaMemcpyKind kind = trivial_copy_detail::cuda_memcpy_kind(thrust::detail::derived_cast(exec), thrust::detail::derived_cast(exec));
-  trivial_copy_detail::checked_cudaMemcpyAsync(dst, src, n * sizeof(T), kind, stream(thrust::detail::derived_cast(exec)));
-#else
-  thrust::transform(exec, first, first + n, result, thrust::identity<T>());
-#endif
-}
-
-
-template<typename System1,
-         typename System2,
-         typename RandomAccessIterator1,
-         typename Size,
-         typename RandomAccessIterator2>
-void trivial_copy_n(cross_system<System1,System2> &systems,
-                    RandomAccessIterator1 first,
-                    Size n,
-                    RandomAccessIterator2 result)
-{
-  typedef typename thrust::iterator_value<RandomAccessIterator1>::type T;
-
-  void *dst = thrust::raw_pointer_cast(&*result);
-  const void *src = thrust::raw_pointer_cast(&*first);
-
-  cudaMemcpyKind kind = trivial_copy_detail::cuda_memcpy_kind(thrust::detail::derived_cast(systems.system1), thrust::detail::derived_cast(systems.system2));
-
-
-  // async host <-> device copy , but synchronize on a user provided stream
-  cudaStream_t s = trivial_copy_detail::cuda_memcpy_stream(derived_cast(systems.system1), derived_cast(systems.system2));
-  trivial_copy_detail::checked_cudaMemcpyAsync(dst, src, n * sizeof(T), kind, s);
-  synchronize(s, "failed synchronize in thrust::system::cuda::detail::trivial_copy_n");
-}
-
-
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
-
diff --git a/thrust/system/cuda/detail/uninitialized_copy.h b/thrust/system/cuda/detail/uninitialized_copy.h
index c6ae90664..f21b7c0d6 100644
--- a/thrust/system/cuda/detail/uninitialized_copy.h
+++ b/thrust/system/cuda/detail/uninitialized_copy.h
@@ -1,22 +1,111 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+namespace __uninitialized_copy {
+
+  template <class InputIt, class OutputIt>
+  struct functor
+  {
+    InputIt  input;
+    OutputIt output;
+
+    typedef typename iterator_traits<InputIt>::value_type  InputType;
+    typedef typename iterator_traits<OutputIt>::value_type OutputType;
+
+    THRUST_FUNCTION
+    functor(InputIt input_, OutputIt output_)
+        : input(input_), output(output_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      InputType const &in  = raw_reference_cast(input[idx]);
+      OutputType &     out = raw_reference_cast(output[idx]);
+
+#if defined(__CUDA__) && defined(__clang__)
+      // XXX unsafe, but clang is seemngly unable to call in-place new
+      out = in;
+#else
+      ::new (static_cast<void *>(&out)) OutputType(in);
+#endif
+    }
+  };    // struct functor
+
+}    // namespace __uninitialized_copy
+
+template <class Derived,
+          class InputIt,
+          class Size,
+          class OutputIt>
+OutputIt __host__ __device__
+uninitialized_copy_n(execution_policy<Derived> &policy,
+                     InputIt                    first,
+                     Size                       count,
+                     OutputIt                   result)
+{
+  typedef __uninitialized_copy::functor<InputIt,OutputIt> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, result),
+                         count);
+
+  return result + count;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+uninitialized_copy(execution_policy<Derived>& policy,
+                   InputIt                    first,
+                   InputIt                    last,
+                   OutputIt                   result)
+{
+  return cuda_cub::uninitialized_copy_n(policy,
+                                        first,
+                                        thrust::distance(first, last),
+                                        result);
+}
+
+}    // namespace cuda_
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/uninitialized_fill.h b/thrust/system/cuda/detail/uninitialized_fill.h
index c6ae90664..96b970201 100644
--- a/thrust/system/cuda/detail/uninitialized_fill.h
+++ b/thrust/system/cuda/detail/uninitialized_fill.h
@@ -1,22 +1,109 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+#include <iterator>
+#include <thrust/distance.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system/cuda/detail/util.h>
+#include <thrust/system/cuda/detail/parallel_for.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace cuda_cub {
+
+namespace __uninitialized_fill {
+
+  template <class Iterator, class T>
+  struct functor
+  {
+    Iterator  items;
+    T         value;
+
+    typedef typename iterator_traits<Iterator>::value_type value_type;
+
+    THRUST_FUNCTION
+    functor(Iterator items_, T const& value_)
+        : items(items_), value(value_) {}
+
+    template<class Size>
+    void THRUST_DEVICE_FUNCTION operator()(Size idx)
+    {
+      value_type& out = raw_reference_cast(items[idx]);
+
+#if defined(__CUDA__) && defined(__clang__)
+      // XXX unsafe. cuda-clang is seemingly unable to call ::new in device code
+      out = value;
+#else
+      ::new (static_cast<void *>(&out)) value_type(value);
+#endif
+    }
+  };    // struct functor
+
+}    // namespace __uninitialized_copy
+
+template <class Derived,
+          class Iterator,
+          class Size,
+          class T>
+Iterator __host__ __device__
+uninitialized_fill_n(execution_policy<Derived>& policy,
+                     Iterator                   first,
+                     Size                       count,
+                     T const&                   x)
+{
+  typedef __uninitialized_fill::functor<Iterator,T> functor_t;
+
+  cuda_cub::parallel_for(policy,
+                         functor_t(first, x),
+                         count);
+
+  return first + count;
+}
+
+template <class Derived,
+          class Iterator,
+          class T>
+void __host__ __device__
+uninitialized_fill(execution_policy<Derived>& policy,
+                   Iterator                   first,
+                   Iterator                   last,
+                   T const&                   x)
+{
+  cuda_cub::uninitialized_fill_n(policy,
+                              first,
+                              thrust::distance(first, last),
+                              x);
+}
+
+}    // namespace cuda_cub
 
+THRUST_NAMESPACE_END
+#endif
diff --git a/thrust/system/cuda/detail/unique.h b/thrust/system/cuda/detail/unique.h
index c6ae90664..653ffa79a 100644
--- a/thrust/system/cuda/detail/unique.h
+++ b/thrust/system/cuda/detail/unique.h
@@ -1,22 +1,821 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/advance.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__ ForwardIterator
+unique(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator                                             first,
+    ForwardIterator                                             last,
+    BinaryPredicate                                             binary_pred);
+
+template <typename DerivedPolicy,
+          typename InputIterator,
+          typename OutputIterator,
+          typename BinaryPredicate>
+__host__ __device__ OutputIterator
+unique_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator                                               first,
+    InputIterator                                               last,
+    OutputIterator                                              result,
+    BinaryPredicate                                             binary_pred);
+
+template <typename DerivedPolicy,
+          typename ForwardIterator,
+          typename BinaryPredicate>
+__host__ __device__ typename thrust::iterator_traits<ForwardIterator>::difference_type
+unique_count(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator                                             first,
+    ForwardIterator                                             last,
+    BinaryPredicate                                             binary_pred);
+
+namespace cuda_cub {
+
+// XXX  it should be possible to unify unique & unique_by_key into a single
+//      agent with various specializations, similar to what is done
+//      with partition
+namespace __unique {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, int NOMINAL_4B_ITEMS_PER_THREAD>
+  struct items_per_thread
+  {
+    enum
+    {
+      value = mpl::min<
+          int,
+          NOMINAL_4B_ITEMS_PER_THREAD,
+          mpl::max<int,
+                   1,
+                   static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                    sizeof(T))>::value>::value
+    };
+  };
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<64,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm52
+
+
+  template <class T>
+  struct Tuning<sm35, T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm35
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm30
+
+  template <class ItemsIt,
+            class ItemsOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  struct UniqueAgent
+  {
+    typedef typename iterator_traits<ItemsIt>::value_type item_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, item_type>::type
+    {
+      typedef Tuning<Arch, item_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, ItemsIt>::type ItemsLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, ItemsLoadIt>::type BlockLoadItems;
+
+      typedef cub::BlockDiscontinuity<item_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityItems;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      typedef core::uninitialized_array<item_type, PtxPlan::ITEMS_PER_TILE>
+          shared_items_t;
+
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage               scan;
+          typename TilePrefixCallback::TempStorage      prefix;
+          typename BlockDiscontinuityItems::TempStorage discontinuity;
+        } scan_storage;
+
+        typename BlockLoadItems::TempStorage  load_items;
+        shared_items_t shared_items;
+
+      };    // union TempStorage
+    };      // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::ItemsLoadIt             ItemsLoadIt;
+    typedef typename ptx_plan::BlockLoadItems          BlockLoadItems;
+    typedef typename ptx_plan::BlockDiscontinuityItems BlockDiscontinuityItems;
+    typedef typename ptx_plan::TilePrefixCallback      TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan               BlockScan;
+    typedef typename ptx_plan::shared_items_t          shared_items_t;
+    typedef typename ptx_plan::TempStorage             TempStorage;
+
+    enum
+    {
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      temp_storage;
+      ScanTileState &                    tile_state;
+      ItemsLoadIt                        items_in;
+      ItemsOutputIt                      items_out;
+      cub::InequalityWrapper<BinaryPred> predicate;
+      Size                               num_items;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      shared_items_t &get_shared()
+      {
+        return temp_storage.shared_items;
+      }
+
+      void THRUST_DEVICE_FUNCTION
+      scatter(item_type (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  /*num_tile_items*/,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size /*num_selections*/)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            get_shared()[local_scatter_offset] = items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          items_out[num_selections_prefix + item] = get_shared()[item];
+        }
+
+        sync_threadblock();
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        using core::sync_threadblock;
+        using core::uninitialized_array;
+
+        item_type items_loc[ITEMS_PER_THREAD];
+        Size      selection_flags[ITEMS_PER_THREAD];
+        Size      selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_in + tile_base,
+                    items_loc,
+                    num_tile_items,
+                    *(items_in + tile_base));
+        }
+        else
+        {
+          BlockLoadItems(temp_storage.load_items)
+              .Load(items_in + tile_base, items_loc);
+        }
+
+
+        sync_threadblock();
+
+        if (IS_FIRST_TILE)
+        {
+          BlockDiscontinuityItems(temp_storage.scan_storage.discontinuity)
+              .FlagHeads(selection_flags, items_loc, predicate);
+        }
+        else
+        {
+          item_type tile_predecessor = items_in[tile_base - 1];
+          BlockDiscontinuityItems(temp_storage.scan_storage.discontinuity)
+              .FlagHeads(selection_flags, items_loc, predicate, tile_predecessor);
+        }
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set selection_flags for out-of-bounds items
+          if ((IS_LAST_TILE) &&
+              (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+            selection_flags[ITEM] = 1;
+        }
+
+        sync_threadblock();
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.scan_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        sync_threadblock();
+
+        scatter(items_loc,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+      }
+
+
+      template <bool IS_LAST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           ItemsLoadIt      items_in_,
+           ItemsOutputIt    items_out_,
+           BinaryPred       binary_pred_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            items_in(items_in_),
+            items_out(items_out_),
+            predicate(binary_pred_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ItemsIt          items_in,
+                       ItemsOutputIt    items_out,
+                       BinaryPred       binary_pred,
+                       NumSelectedOutIt num_selected_out,
+                       Size             num_items,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), items_in),
+           items_out,
+           binary_pred,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  };    // struct UniqueAgent
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+  template <class ItemsInputIt,
+            class ItemsOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            ItemsInputIt     items_in,
+            ItemsOutputIt    items_out,
+            BinaryPred       binary_pred,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        UniqueAgent<ItemsInputIt,
+                    ItemsOutputIt,
+                    BinaryPred,
+                    Size,
+                    NumSelectedOutIt> >
+        unique_agent;
+
+    typedef typename unique_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type   init_plan   = init_agent::get_plan();
+    typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
+
+
+    int tile_size = unique_plan.items_per_tile;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+    size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    size_t      allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    //
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    num_tiles = max<size_t>(1,num_tiles);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent");
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    if (num_items == 0) { return status; }
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent");
+    ua.launch(items_in,
+              items_out,
+              binary_pred,
+              num_selected_out,
+              num_items,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename ItemsInputIt,
+            typename ItemsOutputIt,
+            typename BinaryPred>
+  THRUST_RUNTIME_FUNCTION
+  ItemsOutputIt unique(execution_policy<Derived>& policy,
+                       ItemsInputIt               items_first,
+                       ItemsInputIt               items_last,
+                       ItemsOutputIt              items_result,
+                       BinaryPred                 binary_pred)
+  {
+    //  typedef typename iterator_traits<ItemsInputIt>::difference_type size_type;
+    typedef int size_type;
+
+    size_type    num_items          = static_cast<size_type>(thrust::distance(items_first, items_last));
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = doit_step(NULL,
+                       temp_storage_bytes,
+                       items_first,
+                       items_result,
+                       binary_pred,
+                       reinterpret_cast<size_type*>(NULL),
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "unique: failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique: failed on 1st step");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = doit_step(allocations[1],
+                       temp_storage_bytes,
+                       items_first,
+                       items_result,
+                       binary_pred,
+                       d_num_selected_out,
+                       num_items,
+                       stream);
+    cuda_cub::throw_on_error(status, "unique: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "unique: failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return items_result + num_selected;
+  }
+}    // namespace __unique
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class InputIt,
+          class OutputIt,
+          class BinaryPred>
+OutputIt __host__ __device__
+unique_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result,
+            BinaryPred                 binary_pred)
+{
+  THRUST_CDP_DISPATCH(
+    (result = __unique::unique(policy, first, last, result, binary_pred);),
+    (result = thrust::unique_copy(cvt_to_seq(derived_cast(policy)),
+                                  first,
+                                  last,
+                                  result,
+                                  binary_pred);));
+  return result;
+}
+
+template <class Derived,
+          class InputIt,
+          class OutputIt>
+OutputIt __host__ __device__
+unique_copy(execution_policy<Derived> &policy,
+            InputIt                    first,
+            InputIt                    last,
+            OutputIt                   result)
+{
+  typedef typename iterator_traits<InputIt>::value_type input_type;
+  return cuda_cub::unique_copy(policy, first, last, result, equal_to<input_type>());
+}
+
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ForwardIt,
+          class BinaryPred>
+ForwardIt __host__ __device__
+unique(execution_policy<Derived> &policy,
+       ForwardIt                  first,
+       ForwardIt                  last,
+       BinaryPred                 binary_pred)
+{
+  ForwardIt ret = first;
+  THRUST_CDP_DISPATCH(
+    (ret = cuda_cub::unique_copy(policy, first, last, first, binary_pred);),
+    (ret = thrust::unique(cvt_to_seq(derived_cast(policy)),
+                          first,
+                          last,
+                          binary_pred);));
+  return ret;
+}
+
+template <class Derived,
+          class ForwardIt>
+ForwardIt __host__ __device__
+unique(execution_policy<Derived> &policy,
+       ForwardIt                  first,
+       ForwardIt                  last)
+{
+  typedef typename iterator_traits<ForwardIt>::value_type input_type;
+  return cuda_cub::unique(policy, first, last, equal_to<input_type>());
+}
+
+
+template <typename BinaryPred>
+struct zip_adj_not_predicate {
+  template <typename TupleType>
+  bool __host__ __device__ operator()(TupleType&& tuple) {
+      return !binary_pred(thrust::get<0>(tuple), thrust::get<1>(tuple));
+  }
+  
+  BinaryPred binary_pred;
+};
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class ForwardIt,
+          class BinaryPred>
+typename thrust::iterator_traits<ForwardIt>::difference_type
+__host__ __device__
+unique_count(execution_policy<Derived> &policy,
+       ForwardIt                  first,
+       ForwardIt                  last,
+       BinaryPred                 binary_pred)
+{
+  if (first == last) {
+    return 0;
+  }
+  auto size = thrust::distance(first, last);
+  auto it = thrust::make_zip_iterator(thrust::make_tuple(first, thrust::next(first)));
+  return 1 + thrust::count_if(policy, it, thrust::next(it, size - 1), zip_adj_not_predicate<BinaryPred>{binary_pred});
+}
+
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
 
+//
+#include <thrust/memory.h>
+#include <thrust/unique.h>
+#endif
diff --git a/thrust/system/cuda/detail/unique_by_key.h b/thrust/system/cuda/detail/unique_by_key.h
index c6ae90664..d5ce8e786 100644
--- a/thrust/system/cuda/detail/unique_by_key.h
+++ b/thrust/system/cuda/detail/unique_by_key.h
@@ -1,22 +1,919 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
  *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
  *
- *      http://www.apache.org/licenses/LICENSE-2.0
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+ ******************************************************************************/
 #pragma once
 
 #include <thrust/detail/config.h>
 
-// this system has no special version of this algorithm 
+#if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC
+
+#include <thrust/detail/alignment.h>
+#include <thrust/detail/cstdint.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/detail/minmax.h>
+#include <thrust/detail/mpl/math.h>
+#include <thrust/distance.h>
+#include <thrust/functional.h>
+#include <thrust/pair.h>
+#include <thrust/system/cuda/config.h>
+#include <thrust/system/cuda/detail/cdp_dispatch.h>
+#include <thrust/system/cuda/detail/core/agent_launcher.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/cuda/detail/par_to_seq.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <cub/device/device_select.cuh>
+#include <cub/util_math.cuh>
+
+THRUST_NAMESPACE_BEGIN
+
+template <typename DerivedPolicy,
+          typename ForwardIterator1,
+          typename ForwardIterator2>
+__host__ __device__ thrust::pair<ForwardIterator1, ForwardIterator2>
+unique_by_key(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    ForwardIterator1                                            keys_first,
+    ForwardIterator1                                            keys_last,
+    ForwardIterator2                                            values_first);
+template <typename DerivedPolicy,
+          typename InputIterator1,
+          typename InputIterator2,
+          typename OutputIterator1,
+          typename OutputIterator2>
+__host__ __device__ thrust::pair<OutputIterator1, OutputIterator2>
+unique_by_key_copy(
+    const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+    InputIterator1                                              keys_first,
+    InputIterator1                                              keys_last,
+    InputIterator2                                              values_first,
+    OutputIterator1                                             keys_result,
+    OutputIterator2                                             values_result);
+
+
+namespace cuda_cub {
+
+// XXX  it should be possible to unify unique & unique_by_key into a single
+//      agent with various specializations, similar to what is done
+//      with partition
+namespace __unique_by_key {
+
+  template <int                     _BLOCK_THREADS,
+            int                     _ITEMS_PER_THREAD = 1,
+            cub::BlockLoadAlgorithm _LOAD_ALGORITHM   = cub::BLOCK_LOAD_DIRECT,
+            cub::CacheLoadModifier  _LOAD_MODIFIER    = cub::LOAD_LDG,
+            cub::BlockScanAlgorithm _SCAN_ALGORITHM   = cub::BLOCK_SCAN_WARP_SCANS>
+  struct PtxPolicy
+  {
+    enum
+    {
+      BLOCK_THREADS    = _BLOCK_THREADS,
+      ITEMS_PER_THREAD = _ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = _BLOCK_THREADS * _ITEMS_PER_THREAD,
+    };
+    static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM;
+    static const cub::CacheLoadModifier  LOAD_MODIFIER  = _LOAD_MODIFIER;
+    static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM;
+  };    // struct PtxPolicy
+
+  template<class,class>
+  struct Tuning;
+
+  namespace mpl = thrust::detail::mpl::math;
+
+  template<class T, size_t NOMINAL_4B_ITEMS_PER_THREAD>
+  struct items_per_thread
+  {
+    enum
+    {
+      value = mpl::min<
+          int,
+          static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD),
+          mpl::max<int,
+                   1,
+                   static_cast<int>(NOMINAL_4B_ITEMS_PER_THREAD * 4 /
+                   sizeof(T))>::value>::value
+    };
+  };
+
+
+  template<class T>
+  struct Tuning<sm52,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 11,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<64,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm52
+
+  template<class T>
+  struct Tuning<sm35,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 9,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_LDG,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm35
+
+  template<class T>
+  struct Tuning<sm30,T>
+  {
+    const static int INPUT_SIZE = sizeof(T);
+    enum
+    {
+      NOMINAL_4B_ITEMS_PER_THREAD = 7,
+      //
+      ITEMS_PER_THREAD = items_per_thread<T,
+                                          NOMINAL_4B_ITEMS_PER_THREAD>::value
+    };
+
+    typedef PtxPolicy<128,
+                      ITEMS_PER_THREAD,
+                      cub::BLOCK_LOAD_WARP_TRANSPOSE,
+                      cub::LOAD_DEFAULT,
+                      cub::BLOCK_SCAN_WARP_SCANS>
+        type;
+  };    // Tuning for sm30
+
+  template <class KeyInputIt,
+            class ValInputIt,
+            class KeyOutputIt,
+            class ValOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  struct UniqueByKeyAgent
+  {
+    typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+    typedef typename iterator_traits<ValInputIt>::value_type value_type;
+
+    typedef cub::ScanTileState<Size> ScanTileState;
+
+    template <class Arch>
+    struct PtxPlan : Tuning<Arch, key_type>::type
+    {
+      typedef Tuning<Arch, key_type> tuning;
+
+      typedef typename core::LoadIterator<PtxPlan, KeyInputIt>::type KeyLoadIt;
+      typedef typename core::LoadIterator<PtxPlan, ValInputIt>::type ValLoadIt;
+
+      typedef typename core::BlockLoad<PtxPlan, KeyLoadIt>::type BlockLoadKeys;
+      typedef typename core::BlockLoad<PtxPlan, ValLoadIt>::type BlockLoadValues;
+
+      typedef cub::BlockDiscontinuity<key_type,
+                                      PtxPlan::BLOCK_THREADS,
+                                      1,
+                                      1,
+                                      Arch::ver>
+          BlockDiscontinuityKeys;
+
+      typedef cub::TilePrefixCallbackOp<Size,
+                                        cub::Sum,
+                                        ScanTileState,
+                                        Arch::ver>
+          TilePrefixCallback;
+      typedef cub::BlockScan<Size,
+                             PtxPlan::BLOCK_THREADS,
+                             PtxPlan::SCAN_ALGORITHM,
+                             1,
+                             1,
+                             Arch::ver>
+          BlockScan;
+
+      typedef core::uninitialized_array<key_type, PtxPlan::ITEMS_PER_TILE>
+          shared_keys_t;
+      typedef core::uninitialized_array<value_type, PtxPlan::ITEMS_PER_TILE>
+          shared_values_t;
+
+      union TempStorage
+      {
+        struct ScanStorage
+        {
+          typename BlockScan::TempStorage              scan;
+          typename TilePrefixCallback::TempStorage     prefix;
+          typename BlockDiscontinuityKeys::TempStorage discontinuity;
+        } scan_storage;
+
+        typename BlockLoadKeys::TempStorage   load_keys;
+        typename BlockLoadValues::TempStorage load_values;
+
+        shared_keys_t   shared_keys;
+        shared_values_t shared_values;
+      };    // union TempStorage
+    };    // struct PtxPlan
+
+    typedef typename core::specialize_plan_msvc10_war<PtxPlan>::type::type ptx_plan;
+
+    typedef typename ptx_plan::KeyLoadIt              KeyLoadIt;
+    typedef typename ptx_plan::ValLoadIt              ValLoadIt;
+    typedef typename ptx_plan::BlockLoadKeys          BlockLoadKeys;
+    typedef typename ptx_plan::BlockLoadValues        BlockLoadValues;
+    typedef typename ptx_plan::BlockDiscontinuityKeys BlockDiscontinuityKeys;
+    typedef typename ptx_plan::TilePrefixCallback     TilePrefixCallback;
+    typedef typename ptx_plan::BlockScan              BlockScan;
+    typedef typename ptx_plan::TempStorage            TempStorage;
+    typedef typename ptx_plan::shared_keys_t          shared_keys_t;
+    typedef typename ptx_plan::shared_values_t        shared_values_t;
+
+    enum
+    {
+      BLOCK_THREADS    = ptx_plan::BLOCK_THREADS,
+      ITEMS_PER_THREAD = ptx_plan::ITEMS_PER_THREAD,
+      ITEMS_PER_TILE   = ptx_plan::ITEMS_PER_TILE
+    };
+
+    struct impl
+    {
+      //---------------------------------------------------------------------
+      // Per-thread fields
+      //---------------------------------------------------------------------
+
+      TempStorage &                      temp_storage;
+      ScanTileState &                    tile_state;
+      KeyLoadIt                          keys_in;
+      ValLoadIt                          values_in;
+      KeyOutputIt                        keys_out;
+      ValOutputIt                        values_out;
+      cub::InequalityWrapper<BinaryPred> predicate;
+      Size                               num_items;
+
+      //---------------------------------------------------------------------
+      // Utility functions
+      //---------------------------------------------------------------------
+
+      struct key_tag {};
+      struct value_tag {};
+
+      THRUST_DEVICE_FUNCTION
+      shared_keys_t &get_shared(key_tag)
+      {
+        return temp_storage.shared_keys;
+      }
+      THRUST_DEVICE_FUNCTION
+      shared_values_t &get_shared(value_tag)
+      {
+        return temp_storage.shared_values;
+      }
+
+
+      template <class Tag,
+                class OutputIt,
+                class T>
+      void THRUST_DEVICE_FUNCTION
+      scatter(Tag      tag,
+              OutputIt items_out,
+              T (&items)[ITEMS_PER_THREAD],
+              Size (&selection_flags)[ITEMS_PER_THREAD],
+              Size (&selection_indices)[ITEMS_PER_THREAD],
+              int  /*num_tile_items*/,
+              int  num_tile_selections,
+              Size num_selections_prefix,
+              Size /*num_selections*/)
+      {
+        using core::sync_threadblock;
+
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          int local_scatter_offset = selection_indices[ITEM] -
+                                     num_selections_prefix;
+          if (selection_flags[ITEM])
+          {
+            get_shared(tag)[local_scatter_offset] = items[ITEM];
+          }
+        }
+
+        sync_threadblock();
+
+        for (int item = threadIdx.x;
+             item < num_tile_selections;
+             item += BLOCK_THREADS)
+        {
+          items_out[num_selections_prefix + item] = get_shared(tag)[item];
+        }
+
+        sync_threadblock();
+      }
+
+      //---------------------------------------------------------------------
+      // Tile processing
+      //---------------------------------------------------------------------
+
+      template <bool IS_LAST_TILE, bool IS_FIRST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile_impl(int  num_tile_items,
+                        int  tile_idx,
+                        Size tile_base)
+      {
+        using core::sync_threadblock;
+
+        key_type keys[ITEMS_PER_THREAD];
+        Size     selection_flags[ITEMS_PER_THREAD];
+        Size     selection_idx[ITEMS_PER_THREAD];
+
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadKeys(temp_storage.load_keys)
+              .Load(keys_in + tile_base,
+                    keys,
+                    num_tile_items,
+                    *(keys_in + tile_base));
+        }
+        else
+        {
+          BlockLoadKeys(temp_storage.load_keys).Load(keys_in + tile_base, keys);
+        }
+
+
+        sync_threadblock();
+
+        value_type values[ITEMS_PER_THREAD];
+        if (IS_LAST_TILE)
+        {
+          // Fill last elements with the first element
+          // because collectives are not suffix guarded
+          BlockLoadValues(temp_storage.load_values)
+              .Load(values_in + tile_base,
+                    values,
+                    num_tile_items,
+                    *(values_in + tile_base));
+        }
+        else
+        {
+          BlockLoadValues(temp_storage.load_values)
+              .Load(values_in + tile_base, values);
+        }
+
+        sync_threadblock();
+
+        if (IS_FIRST_TILE)
+        {
+          BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
+              .FlagHeads(selection_flags, keys, predicate);
+        }
+        else
+        {
+          key_type tile_predecessor = keys_in[tile_base - 1];
+          BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity)
+              .FlagHeads(selection_flags, keys, predicate, tile_predecessor);
+        }
+#pragma unroll
+        for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM)
+        {
+          // Set selection_flags for out-of-bounds items
+          if ((IS_LAST_TILE) && (Size(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items))
+            selection_flags[ITEM] = 1;
+        }
+
+        sync_threadblock();
+
+
+        Size num_tile_selections   = 0;
+        Size num_selections        = 0;
+        Size num_selections_prefix = 0;
+        if (IS_FIRST_TILE)
+        {
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            num_tile_selections);
+
+          if (threadIdx.x == 0)
+          {
+            // Update tile status if this is not the last tile
+            if (!IS_LAST_TILE)
+              tile_state.SetInclusive(0, num_tile_selections);
+          }
+
+          // Do not count any out-of-bounds selections
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+          }
+          num_selections = num_tile_selections;
+        }
+        else
+        {
+          TilePrefixCallback prefix_cb(tile_state,
+                                       temp_storage.scan_storage.prefix,
+                                       cub::Sum(),
+                                       tile_idx);
+          BlockScan(temp_storage.scan_storage.scan)
+              .ExclusiveSum(selection_flags,
+                            selection_idx,
+                            prefix_cb);
+
+          num_selections        = prefix_cb.GetInclusivePrefix();
+          num_tile_selections   = prefix_cb.GetBlockAggregate();
+          num_selections_prefix = prefix_cb.GetExclusivePrefix();
+
+          if (IS_LAST_TILE)
+          {
+            int num_discount = ITEMS_PER_TILE - num_tile_items;
+            num_tile_selections -= num_discount;
+            num_selections -= num_discount;
+          }
+        }
+
+        sync_threadblock();
+
+        scatter(key_tag(),
+                keys_out,
+                keys,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        sync_threadblock();
+
+        scatter(value_tag(),
+                values_out,
+                values,
+                selection_flags,
+                selection_idx,
+                num_tile_items,
+                num_tile_selections,
+                num_selections_prefix,
+                num_selections);
+
+        return num_selections;
+      }
+
+
+      template <bool IS_LAST_TILE>
+      Size THRUST_DEVICE_FUNCTION
+      consume_tile(int  num_tile_items,
+                   int  tile_idx,
+                   Size tile_base)
+      {
+        if (tile_idx == 0)
+        {
+          return consume_tile_impl<IS_LAST_TILE, true>(num_tile_items,
+                                                       tile_idx,
+                                                       tile_base);
+        }
+        else
+        {
+          return consume_tile_impl<IS_LAST_TILE, false>(num_tile_items,
+                                                        tile_idx,
+                                                        tile_base);
+        }
+      }
+
+      //---------------------------------------------------------------------
+      // Constructor
+      //---------------------------------------------------------------------
+
+      THRUST_DEVICE_FUNCTION
+      impl(TempStorage &    temp_storage_,
+           ScanTileState &  tile_state_,
+           KeyLoadIt        keys_in_,
+           ValLoadIt        values_in_,
+           KeyOutputIt      keys_out_,
+           ValOutputIt      values_out_,
+           BinaryPred       binary_pred_,
+           Size             num_items_,
+           int              num_tiles,
+           NumSelectedOutIt num_selected_out)
+          // filed ctors
+          : temp_storage(temp_storage_),
+            tile_state(tile_state_),
+            keys_in(keys_in_),
+            values_in(values_in_),
+            keys_out(keys_out_),
+            values_out(values_out_),
+            predicate(binary_pred_),
+            num_items(num_items_)
+      {
+        int  tile_idx  = blockIdx.x;
+        Size tile_base = tile_idx * ITEMS_PER_TILE;
+
+        if (tile_idx < num_tiles - 1)
+        {
+          consume_tile<false>(ITEMS_PER_TILE,
+                              tile_idx,
+                              tile_base);
+        }
+        else
+        {
+          int  num_remaining  = static_cast<int>(num_items - tile_base);
+          Size num_selections = consume_tile<true>(num_remaining,
+                                                   tile_idx,
+                                                   tile_base);
+          if (threadIdx.x == 0)
+          {
+            *num_selected_out = num_selections;
+          }
+        }
+      }
+    };    // struct impl
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(KeyInputIt       keys_in,
+                       ValInputIt       values_in,
+                       KeyOutputIt      keys_out,
+                       ValOutputIt      values_out,
+                       BinaryPred       binary_pred,
+                       NumSelectedOutIt num_selected_out,
+                       Size             num_items,
+                       ScanTileState    tile_state,
+                       int              num_tiles,
+                       char *           shmem)
+    {
+      TempStorage &storage = *reinterpret_cast<TempStorage *>(shmem);
+
+      impl(storage,
+           tile_state,
+           core::make_load_iterator(ptx_plan(), keys_in),
+           core::make_load_iterator(ptx_plan(), values_in),
+           keys_out,
+           values_out,
+           binary_pred,
+           num_items,
+           num_tiles,
+           num_selected_out);
+    }
+  }; // struct UniqueByKeyAgent
+
+
+  template <class ScanTileState,
+            class NumSelectedIt,
+            class Size>
+  struct InitAgent
+  {
+    template <class Arch>
+    struct PtxPlan : PtxPolicy<128> {};
+
+    typedef core::specialize_plan<PtxPlan> ptx_plan;
+
+    //---------------------------------------------------------------------
+    // Agent entry point
+    //---------------------------------------------------------------------
+
+    THRUST_AGENT_ENTRY(ScanTileState tile_state,
+                       Size          num_tiles,
+                       NumSelectedIt num_selected_out,
+                       char * /*shmem*/)
+    {
+      tile_state.InitializeStatus(num_tiles);
+      if (blockIdx.x == 0 && threadIdx.x == 0)
+        *num_selected_out = 0;
+    }
+
+  }; // struct InitAgent
+
+
+  template <class KeyInputIt,
+            class ValInputIt,
+            class KeyOutputIt,
+            class ValOutputIt,
+            class BinaryPred,
+            class Size,
+            class NumSelectedOutIt>
+  static cudaError_t THRUST_RUNTIME_FUNCTION
+  doit_step(void *           d_temp_storage,
+            size_t &         temp_storage_bytes,
+            KeyInputIt       keys_in,
+            ValInputIt       values_in,
+            KeyOutputIt      keys_out,
+            ValOutputIt      values_out,
+            BinaryPred       binary_pred,
+            NumSelectedOutIt num_selected_out,
+            Size             num_items,
+            cudaStream_t     stream)
+  {
+    using core::AgentLauncher;
+    using core::AgentPlan;
+    using core::get_agent_plan;
+
+    typedef AgentLauncher<
+        UniqueByKeyAgent<KeyInputIt,
+                         ValInputIt,
+                         KeyOutputIt,
+                         ValOutputIt,
+                         BinaryPred,
+                         Size,
+                         NumSelectedOutIt> >
+        unique_agent;
+
+    typedef typename unique_agent::ScanTileState ScanTileState;
+
+    typedef AgentLauncher<
+        InitAgent<ScanTileState, NumSelectedOutIt, Size> >
+        init_agent;
+
+    using core::get_plan;
+    typename get_plan<init_agent>::type   init_plan   = init_agent::get_plan();
+    typename get_plan<unique_agent>::type unique_plan = unique_agent::get_plan(stream);
+
+
+    int tile_size = unique_plan.items_per_tile;
+    size_t num_tiles = cub::DivideAndRoundUp(num_items, tile_size);
+
+    size_t vshmem_size = core::vshmem_size(unique_plan.shared_memory_size,
+                                           num_tiles);
+
+    cudaError_t status = cudaSuccess;
+    size_t      allocation_sizes[2] = {0, vshmem_size};
+    status = ScanTileState::AllocationSize(static_cast<int>(num_tiles), allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    void *allocations[2] = {NULL, NULL};
+    //
+    status = cub::AliasTemporaries(d_temp_storage,
+                                   temp_storage_bytes,
+                                   allocations,
+                                   allocation_sizes);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    if (d_temp_storage == NULL)
+    {
+      return status;
+    }
+
+    ScanTileState tile_status;
+    status =  tile_status.Init(static_cast<int>(num_tiles), allocations[0], allocation_sizes[0]);
+    CUDA_CUB_RET_IF_FAIL(status);
+
+    num_tiles = max<size_t>(1,num_tiles);
+    init_agent ia(init_plan, num_tiles, stream, "unique_by_key::init_agent");
+    ia.launch(tile_status, num_tiles, num_selected_out);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+
+    if (num_items == 0) { return status; }
+
+    char *vshmem_ptr = vshmem_size > 0 ? (char *)allocations[1] : NULL;
+
+    unique_agent ua(unique_plan, num_items, stream, vshmem_ptr, "unique_by_key::unique_agent");
+    ua.launch(keys_in,
+              values_in,
+              keys_out,
+              values_out,
+              binary_pred,
+              num_selected_out,
+              num_items,
+              tile_status,
+              num_tiles);
+    CUDA_CUB_RET_IF_FAIL(cudaPeekAtLastError());
+    return status;
+  }
+
+  template <typename Derived,
+            typename KeyInputIt,
+            typename ValInputIt,
+            typename KeyOutputIt,
+            typename ValOutputIt,
+            typename BinaryPred>
+  THRUST_RUNTIME_FUNCTION
+  pair<KeyOutputIt, ValOutputIt>
+  unique_by_key(execution_policy<Derived>& policy,
+                KeyInputIt                 keys_first,
+                KeyInputIt                 keys_last,
+                ValInputIt                 values_first,
+                KeyOutputIt                keys_result,
+                ValOutputIt                values_result,
+                BinaryPred                 binary_pred)
+  {
+
+    typedef int size_type;
+
+    size_type num_items
+      = static_cast<size_type>(thrust::distance(keys_first, keys_last));
+
+    size_t       temp_storage_bytes = 0;
+    cudaStream_t stream             = cuda_cub::stream(policy);
+
+    cudaError_t status;
+    status = __unique_by_key::doit_step(NULL,
+                                        temp_storage_bytes,
+                                        keys_first,
+                                        values_first,
+                                        keys_result,
+                                        values_result,
+                                        binary_pred,
+                                        reinterpret_cast<size_type*>(NULL),
+                                        num_items,
+                                        stream);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step");
+
+    size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes};
+    void * allocations[2]      = {NULL, NULL};
+
+    size_t storage_size = 0;
+    status = core::alias_storage(NULL,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage");
+
+    // Allocate temporary storage.
+    thrust::detail::temporary_array<thrust::detail::uint8_t, Derived>
+      tmp(policy, storage_size);
+    void *ptr = static_cast<void*>(tmp.data().get());
+
+    status = core::alias_storage(ptr,
+                                 storage_size,
+                                 allocations,
+                                 allocation_sizes);
+    cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage");
+
+    size_type* d_num_selected_out
+      = thrust::detail::aligned_reinterpret_cast<size_type*>(allocations[0]);
+
+    status = __unique_by_key::doit_step(allocations[1],
+                                        temp_storage_bytes,
+                                        keys_first,
+                                        values_first,
+                                        keys_result,
+                                        values_result,
+                                        binary_pred,
+                                        d_num_selected_out,
+                                        num_items,
+                                        stream);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step");
+
+    status = cuda_cub::synchronize(policy);
+    cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize");
+
+    size_type num_selected = get_value(policy, d_num_selected_out);
+
+    return thrust::make_pair(
+      keys_result + num_selected,
+      values_result + num_selected
+    );
+  }
+
+} // namespace __unique_by_key
+
+
+//-------------------------
+// Thrust API entry points
+//-------------------------
+
+
+__thrust_exec_check_disable__
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt,
+          class BinaryPred>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+unique_by_key_copy(execution_policy<Derived> &policy,
+                   KeyInputIt                 keys_first,
+                   KeyInputIt                 keys_last,
+                   ValInputIt                 values_first,
+                   KeyOutputIt                keys_result,
+                   ValOutputIt                values_result,
+                   BinaryPred                 binary_pred)
+{
+  auto ret = thrust::make_pair(keys_result, values_result);
+  THRUST_CDP_DISPATCH(
+    (ret = __unique_by_key::unique_by_key(policy,
+                                          keys_first,
+                                          keys_last,
+                                          values_first,
+                                          keys_result,
+                                          values_result,
+                                          binary_pred);),
+    (ret = thrust::unique_by_key_copy(cvt_to_seq(derived_cast(policy)),
+                                      keys_first,
+                                      keys_last,
+                                      values_first,
+                                      keys_result,
+                                      values_result,
+                                      binary_pred);));
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class KeyOutputIt,
+          class ValOutputIt>
+pair<KeyOutputIt, ValOutputIt> __host__ __device__
+unique_by_key_copy(execution_policy<Derived> &policy,
+                   KeyInputIt                 keys_first,
+                   KeyInputIt                 keys_last,
+                   ValInputIt                 values_first,
+                   KeyOutputIt                keys_result,
+                   ValOutputIt                values_result)
+{
+  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::unique_by_key_copy(policy,
+                                   keys_first,
+                                   keys_last,
+                                   values_first,
+                                   keys_result,
+                                   values_result,
+                                   equal_to<key_type>());
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt,
+          class BinaryPred>
+pair<KeyInputIt, ValInputIt> __host__ __device__
+unique_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first,
+              BinaryPred                 binary_pred)
+{
+  auto ret = thrust::make_pair(keys_first, values_first);
+  THRUST_CDP_DISPATCH(
+    (ret = cuda_cub::unique_by_key_copy(policy,
+                                         keys_first,
+                                         keys_last,
+                                         values_first,
+                                         keys_first,
+                                         values_first,
+                                         binary_pred);),
+    (ret = thrust::unique_by_key(cvt_to_seq(derived_cast(policy)),
+                                  keys_first,
+                                  keys_last,
+                                  values_first,
+                                  binary_pred);));
+  return ret;
+}
+
+template <class Derived,
+          class KeyInputIt,
+          class ValInputIt>
+pair<KeyInputIt, ValInputIt> __host__ __device__
+unique_by_key(execution_policy<Derived> &policy,
+              KeyInputIt                 keys_first,
+              KeyInputIt                 keys_last,
+              ValInputIt                 values_first)
+{
+  typedef typename iterator_traits<KeyInputIt>::value_type key_type;
+  return cuda_cub::unique_by_key(policy,
+                              keys_first,
+                              keys_last,
+                              values_first,
+                              equal_to<key_type>());
+}
+
+
+
+}    // namespace cuda_cub
+THRUST_NAMESPACE_END
+
+#include <thrust/memory.h>
+#include <thrust/unique.h>
 
+#endif
diff --git a/thrust/system/cuda/detail/util.h b/thrust/system/cuda/detail/util.h
new file mode 100644
index 000000000..6d9e3681d
--- /dev/null
+++ b/thrust/system/cuda/detail/util.h
@@ -0,0 +1,641 @@
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights meserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+#pragma once
+
+#include <cstdio>
+#include <exception>
+#include <thrust/detail/config.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/system_error.h>
+#include <thrust/system/cuda/error.h>
+
+#include <cub/detail/device_synchronize.cuh>
+#include <cub/util_arch.cuh>
+#include <cub/util_device.cuh>
+
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub {
+
+inline __host__ __device__
+cudaStream_t
+default_stream()
+{
+#ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
+  return cudaStreamPerThread;
+#else
+  return cudaStreamLegacy;
+#endif
+}
+
+// Fallback implementation of the customization point.
+template <class Derived>
+__host__ __device__
+cudaStream_t
+get_stream(execution_policy<Derived> &)
+{
+  return default_stream();
+}
+
+// Entry point/interface.
+template <class Derived>
+__host__ __device__ cudaStream_t
+stream(execution_policy<Derived> &policy)
+{
+  return get_stream(derived_cast(policy));
+}
+
+
+// Fallback implementation of the customization point.
+template <class Derived>
+__host__ __device__
+bool
+must_perform_optional_stream_synchronization(execution_policy<Derived> &)
+{
+  return true;
+}
+
+// Entry point/interface.
+template <class Derived>
+__host__ __device__ bool
+must_perform_optional_synchronization(execution_policy<Derived> &policy)
+{
+  return must_perform_optional_stream_synchronization(derived_cast(policy));
+}
+
+
+// Fallback implementation of the customization point.
+__thrust_exec_check_disable__
+template <class Derived>
+__host__ __device__
+cudaError_t
+synchronize_stream(execution_policy<Derived> &policy)
+{
+  return cub::SyncStream(stream(policy));
+}
+
+// Entry point/interface.
+template <class Policy>
+__host__ __device__
+cudaError_t
+synchronize(Policy &policy)
+{
+  return synchronize_stream(derived_cast(policy));
+}
+
+// Fallback implementation of the customization point.
+__thrust_exec_check_disable__
+template <class Derived>
+__host__ __device__
+cudaError_t
+synchronize_stream_optional(execution_policy<Derived> &policy)
+{
+  cudaError_t result;
+
+  if (must_perform_optional_synchronization(policy))
+  {
+    result = synchronize_stream(policy);
+  }
+  else
+  {
+    result = cudaSuccess;
+  }
+
+  return result;
+}
+
+// Entry point/interface.
+template <class Policy>
+__host__ __device__
+cudaError_t
+synchronize_optional(Policy &policy)
+{
+  return synchronize_stream_optional(derived_cast(policy));
+}
+
+template <class Type>
+THRUST_HOST_FUNCTION cudaError_t
+trivial_copy_from_device(Type *       dst,
+                         Type const * src,
+                         size_t       count,
+                         cudaStream_t stream)
+{
+  cudaError status = cudaSuccess;
+  if (count == 0) return status;
+
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyDeviceToHost,
+                             stream);
+  cudaStreamSynchronize(stream);
+  return status;
+}
+
+template <class Type>
+THRUST_HOST_FUNCTION cudaError_t
+trivial_copy_to_device(Type *       dst,
+                       Type const * src,
+                       size_t       count,
+                       cudaStream_t stream)
+{
+  cudaError status = cudaSuccess;
+  if (count == 0) return status;
+
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyHostToDevice,
+                             stream);
+  cudaStreamSynchronize(stream);
+  return status;
+}
+
+template <class Policy, class Type>
+__host__ __device__ cudaError_t
+trivial_copy_device_to_device(Policy &    policy,
+                              Type *      dst,
+                              Type const *src,
+                              size_t      count)
+{
+  cudaError_t  status = cudaSuccess;
+  if (count == 0) return status;
+
+  cudaStream_t stream = cuda_cub::stream(policy);
+  //
+  status = ::cudaMemcpyAsync(dst,
+                             src,
+                             sizeof(Type) * count,
+                             cudaMemcpyDeviceToDevice,
+                             stream);
+  cuda_cub::synchronize(policy);
+  return status;
+}
+
+inline void __host__ __device__
+terminate()
+{
+  NV_IF_TARGET(NV_IS_HOST, (std::terminate();), (asm("trap;");));
+}
+
+__host__  __device__
+inline void throw_on_error(cudaError_t status)
+{
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+#ifdef THRUST_RDC_ENABLED
+  cudaGetLastError();
+#else
+  NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();));
+#endif
+
+  if (cudaSuccess != status)
+  {
+
+    // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device
+    // instructions out of the target logic.
+#ifdef THRUST_RDC_ENABLED
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %s: %s\n", \
+         cudaGetErrorName(status), \
+         cudaGetErrorString(status))
+
+#else
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %d\n", \
+         static_cast<int>(status))
+
+#endif
+
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system_error(status, thrust::cuda_category());
+    ), (
+      THRUST_TEMP_DEVICE_CODE;
+      cuda_cub::terminate();
+    ));
+
+#undef THRUST_TEMP_DEVICE_CODE
+
+  }
+}
+
+__host__ __device__
+inline void throw_on_error(cudaError_t status, char const *msg)
+{
+  // Clear the global CUDA error state which may have been set by the last
+  // call. Otherwise, errors may "leak" to unrelated kernel launches.
+#ifdef THRUST_RDC_ENABLED
+  cudaGetLastError();
+#else
+  NV_IF_TARGET(NV_IS_HOST, (cudaGetLastError();));
+#endif
+
+  if (cudaSuccess != status)
+  {
+    // Can't use #if inside NV_IF_TARGET, use a temp macro to hoist the device
+    // instructions out of the target logic.
+#ifdef THRUST_RDC_ENABLED
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %s: %s: %s\n", \
+         cudaGetErrorName(status), \
+         cudaGetErrorString(status),\
+         msg)
+
+#else
+
+#define THRUST_TEMP_DEVICE_CODE \
+  printf("Thrust CUDA backend error: %d: %s\n", \
+         static_cast<int>(status),              \
+         msg)
+
+#endif
+
+    NV_IF_TARGET(NV_IS_HOST, (
+      throw thrust::system_error(status, thrust::cuda_category(), msg);
+    ), (
+      THRUST_TEMP_DEVICE_CODE;
+      cuda_cub::terminate();
+    ));
+
+#undef THRUST_TEMP_DEVICE_CODE
+
+  }
+}
+
+// FIXME: Move the iterators elsewhere.
+
+template <class ValueType,
+          class InputIt,
+          class UnaryOp>
+struct transform_input_iterator_t
+{
+  typedef transform_input_iterator_t                         self_t;
+  typedef typename iterator_traits<InputIt>::difference_type difference_type;
+  typedef ValueType                                          value_type;
+  typedef void                                               pointer;
+  typedef value_type                                         reference;
+  typedef std::random_access_iterator_tag                    iterator_category;
+
+  InputIt         input;
+  mutable UnaryOp op;
+
+  __host__ __device__ __forceinline__
+  transform_input_iterator_t(InputIt input, UnaryOp op)
+      : input(input), op(op) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  transform_input_iterator_t(const self_t &) = default;
+#endif
+
+  // UnaryOp might not be copy assignable, such as when it is a lambda.  Define
+  // an explicit copy assignment operator that doesn't try to assign it.
+  __host__ __device__ 
+  self_t& operator=(const self_t& o)
+  {
+    input = o.input;
+    return *this;
+  }
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++input;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++input;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    typename thrust::iterator_value<InputIt>::type x = *input;
+    return op(x);
+  }
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    typename thrust::iterator_value<InputIt>::type x = *input;
+    return op(x);
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(input + n, op);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    input += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(input - n, op);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    input -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return input - other.input;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return op(input[n]);
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (input == rhs.input);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (input != rhs.input);
+  }
+};    // struct transform_input_iterarot_t
+
+template <class ValueType,
+          class InputIt1,
+          class InputIt2,
+          class BinaryOp>
+struct transform_pair_of_input_iterators_t
+{
+  typedef transform_pair_of_input_iterators_t                 self_t;
+  typedef typename iterator_traits<InputIt1>::difference_type difference_type;
+  typedef ValueType                                           value_type;
+  typedef void                                                pointer;
+  typedef value_type                                          reference;
+  typedef std::random_access_iterator_tag                     iterator_category;
+
+  InputIt1         input1;
+  InputIt2         input2;
+  mutable BinaryOp op;
+
+  __host__ __device__ __forceinline__
+  transform_pair_of_input_iterators_t(InputIt1 input1_,
+                                      InputIt2 input2_,
+                                      BinaryOp op_)
+      : input1(input1_), input2(input2_), op(op_) {}
+
+#if THRUST_CPP_DIALECT >= 2011
+  transform_pair_of_input_iterators_t(const self_t &) = default;
+#endif
+
+  // BinaryOp might not be copy assignable, such as when it is a lambda.
+  // Define an explicit copy assignment operator that doesn't try to assign it.
+  __host__ __device__
+  self_t& operator=(const self_t& o)
+  {
+    input1 = o.input1;
+    input2 = o.input2;
+    return *this;
+  }
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++input1;
+    ++input2;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++input1;
+    ++input2;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    return op(*input1, *input2);
+  }
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    return op(*input1, *input2);
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(input1 + n, input2 + n, op);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    input1 += n;
+    input2 += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(input1 - n, input2 - n, op);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    input1 -= n;
+    input2 -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return input1 - other.input1;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return op(input1[n], input2[n]);
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (input1 == rhs.input1) && (input2 == rhs.input2);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (input1 != rhs.input1) || (input2 != rhs.input2);
+  }
+
+};    // struct transform_pair_of_input_iterators_t
+
+
+struct identity
+{
+  template <class T>
+  __host__ __device__ T const &
+  operator()(T const &t) const
+  {
+    return t;
+  }
+
+  template <class T>
+  __host__ __device__ T &
+  operator()(T &t) const
+  {
+    return t;
+  }
+};
+
+
+template <class T>
+struct counting_iterator_t
+{
+  typedef counting_iterator_t             self_t;
+  typedef T                               difference_type;
+  typedef T                               value_type;
+  typedef void                            pointer;
+  typedef T                               reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  T count;
+
+  __host__ __device__ __forceinline__
+  counting_iterator_t(T count_) : count(count_) {}
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_t operator++(int)
+  {
+    self_t retval = *this;
+    ++count;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_t operator++()
+  {
+    ++count;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const
+  {
+    return count;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*()
+  {
+    return count;
+  }
+
+  /// Addition
+  __host__ __device__ __forceinline__ self_t operator+(difference_type n) const
+  {
+    return self_t(count + n);
+  }
+
+  /// Addition assignment
+  __host__ __device__ __forceinline__ self_t &operator+=(difference_type n)
+  {
+    count += n;
+    return *this;
+  }
+
+  /// Subtraction
+  __host__ __device__ __forceinline__ self_t operator-(difference_type n) const
+  {
+    return self_t(count - n);
+  }
+
+  /// Subtraction assignment
+  __host__ __device__ __forceinline__ self_t &operator-=(difference_type n)
+  {
+    count -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type operator-(self_t other) const
+  {
+    return count - other.count;
+  }
+
+  /// Array subscript
+  __host__ __device__ __forceinline__ reference operator[](difference_type n) const
+  {
+    return count + n;
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_t &rhs) const
+  {
+    return (count == rhs.count);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_t &rhs) const
+  {
+    return (count != rhs.count);
+  }
+
+};    // struct count_iterator_t
+
+}    // cuda_
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/cuda/detail/vector.inl b/thrust/system/cuda/detail/vector.inl
deleted file mode 100644
index 761788946..000000000
--- a/thrust/system/cuda/detail/vector.inl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/vector.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-      
-} // end cuda
-} // end system
-} // end thrust
-
diff --git a/thrust/system/cuda/error.h b/thrust/system/cuda/error.h
index fea5f2abe..b180f8347 100644
--- a/thrust/system/cuda/error.h
+++ b/thrust/system/cuda/error.h
@@ -26,8 +26,7 @@
 #include <thrust/system/error_code.h>
 #include <thrust/system/cuda/detail/guarded_driver_types.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -35,15 +34,13 @@ namespace system
 namespace cuda
 {
 
-/*! \addtogroup system
- *  \{
- */
-
 // To construct an error_code after a CUDA Runtime error:
 //
 //   error_code(::cudaGetLastError(), cuda_category())
 
 // XXX N3000 prefers enum class errc { ... }
+/*! Namespace for CUDA Runtime errors.
+ */
 namespace errc
 {
 
@@ -58,7 +55,6 @@ enum errc_t
   memory_allocation                  = cudaErrorMemoryAllocation,
   initialization_error               = cudaErrorInitializationError,
   launch_failure                     = cudaErrorLaunchFailure,
-  prior_launch_failure               = cudaErrorPriorLaunchFailure,
   launch_timeout                     = cudaErrorLaunchTimeout,
   launch_out_of_resources            = cudaErrorLaunchOutOfResources,
   invalid_device_function            = cudaErrorInvalidDeviceFunction,
@@ -69,23 +65,14 @@ enum errc_t
   invalid_symbol                     = cudaErrorInvalidSymbol,
   map_buffer_object_failed           = cudaErrorMapBufferObjectFailed,
   unmap_buffer_object_failed         = cudaErrorUnmapBufferObjectFailed,
-  invalid_host_pointer               = cudaErrorInvalidHostPointer,
-  invalid_device_pointer             = cudaErrorInvalidDevicePointer,
   invalid_texture                    = cudaErrorInvalidTexture,
   invalid_texture_binding            = cudaErrorInvalidTextureBinding,
   invalid_channel_descriptor         = cudaErrorInvalidChannelDescriptor,
   invalid_memcpy_direction           = cudaErrorInvalidMemcpyDirection,
-  address_of_constant_error          = cudaErrorAddressOfConstant,
-  texture_fetch_failed               = cudaErrorTextureFetchFailed,
-  texture_not_bound                  = cudaErrorTextureNotBound,
-  synchronization_error              = cudaErrorSynchronizationError,
   invalid_filter_setting             = cudaErrorInvalidFilterSetting,
   invalid_norm_setting               = cudaErrorInvalidNormSetting,
-  mixed_device_execution             = cudaErrorMixedDeviceExecution,
   cuda_runtime_unloading             = cudaErrorCudartUnloading,
   unknown                            = cudaErrorUnknown,
-  not_yet_implemented                = cudaErrorNotYetImplemented,
-  memory_value_too_large             = cudaErrorMemoryValueTooLarge,
   invalid_resource_handle            = cudaErrorInvalidResourceHandle,
   not_ready                          = cudaErrorNotReady,
   insufficient_driver                = cudaErrorInsufficientDriver,
@@ -131,7 +118,7 @@ enum errc_t
 
 } // end namespace errc
 
-} // end namespace cuda
+} // end namespace cuda_cub
 
 /*! \return A reference to an object of a type derived from class \p thrust::error_category.
  *  \note The object's \p equivalent virtual functions shall behave as specified
@@ -164,23 +151,22 @@ inline error_code make_error_code(cuda::errc::errc_t e);
  */
 inline error_condition make_error_condition(cuda::errc::errc_t e);
 
-/*! \} // end system
- */
-
-
 } // end system
 
-namespace cuda
+namespace cuda_cub
 {
+namespace errc = system::cuda::errc;
+} // end cuda_cub
 
+namespace cuda
+{
 // XXX replace with using system::cuda_errc upon c++0x
 namespace errc = system::cuda::errc;
-
 } // end cuda
 
 using system::cuda_category;
 
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/error.inl>
 
diff --git a/thrust/system/cuda/execution_policy.h b/thrust/system/cuda/execution_policy.h
index 18d38faa9..c171ac3d9 100644
--- a/thrust/system/cuda/execution_policy.h
+++ b/thrust/system/cuda/execution_policy.h
@@ -1,220 +1,31 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
+/******************************************************************************
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
 #pragma once
 
-/*! \file thrust/system/cuda/execution_policy.h
- *  \brief Execution policies for Thrust's CUDA system.
- */
-
 #include <thrust/detail/config.h>
-
-// get the execution policies definitions first
 #include <thrust/system/cuda/detail/execution_policy.h>
-
-// get the definition of par
 #include <thrust/system/cuda/detail/par.h>
-
-// now get all the algorithm defintitions
-
-// the order of the following #includes seems to matter, unfortunately
-
-// primitives come first, in order of increasing sophistication
-#include <thrust/system/cuda/detail/get_value.h>
-#include <thrust/system/cuda/detail/assign_value.h>
-#include <thrust/system/cuda/detail/iter_swap.h>
-
-#include <thrust/system/cuda/detail/for_each.h>
-#include <thrust/system/cuda/detail/copy.h>
-#include <thrust/system/cuda/detail/reduce.h>
-#include <thrust/system/cuda/detail/scan.h>
-#include <thrust/system/cuda/detail/sort.h>
-
-// these are alphabetical
-#include <thrust/system/cuda/detail/adjacent_difference.h>
-#include <thrust/system/cuda/detail/assign_value.h>
-#include <thrust/system/cuda/detail/binary_search.h>
-#include <thrust/system/cuda/detail/copy_if.h>
-#include <thrust/system/cuda/detail/count.h>
-#include <thrust/system/cuda/detail/equal.h>
-#include <thrust/system/cuda/detail/extrema.h>
-#include <thrust/system/cuda/detail/fill.h>
-#include <thrust/system/cuda/detail/find.h>
-#include <thrust/system/cuda/detail/gather.h>
-#include <thrust/system/cuda/detail/generate.h>
-#include <thrust/system/cuda/detail/inner_product.h>
-#include <thrust/system/cuda/detail/iter_swap.h>
-#include <thrust/system/cuda/detail/logical.h>
-#include <thrust/system/cuda/detail/malloc_and_free.h>
-#include <thrust/system/cuda/detail/merge.h>
-#include <thrust/system/cuda/detail/mismatch.h>
-#include <thrust/system/cuda/detail/partition.h>
-#include <thrust/system/cuda/detail/reduce_by_key.h>
-#include <thrust/system/cuda/detail/remove.h>
-#include <thrust/system/cuda/detail/replace.h>
-#include <thrust/system/cuda/detail/reverse.h>
-#include <thrust/system/cuda/detail/scan_by_key.h>
-#include <thrust/system/cuda/detail/scatter.h>
-#include <thrust/system/cuda/detail/sequence.h>
-#include <thrust/system/cuda/detail/set_operations.h>
-#include <thrust/system/cuda/detail/sort.h>
-#include <thrust/system/cuda/detail/swap_ranges.h>
-#include <thrust/system/cuda/detail/tabulate.h>
-#include <thrust/system/cuda/detail/transform.h>
-#include <thrust/system/cuda/detail/transform_reduce.h>
-#include <thrust/system/cuda/detail/transform_scan.h>
-#include <thrust/system/cuda/detail/uninitialized_copy.h>
-#include <thrust/system/cuda/detail/uninitialized_fill.h>
-#include <thrust/system/cuda/detail/unique.h>
-#include <thrust/system/cuda/detail/unique_by_key.h>
-
-
-// define these entities here for the purpose of Doxygenating them
-// they are actually defined elsewhere
-#if 0
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-
-
-/*! \addtogroup execution_policies
- *  \{
- */
-
-
-/*! \p thrust::cuda::execution_policy is the base class for all Thrust parallel execution
- *  policies which are derived from Thrust's CUDA backend system.
- */
-template<typename DerivedPolicy>
-struct execution_policy : thrust::execution_policy<DerivedPolicy>
-{};
-
-
-/*! \p cuda::tag is a type representing Thrust's CUDA backend system in C++'s type system.
- *  Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be
- *  "dispatched" to algorithm implementations in the \p cuda system.
- */
-struct tag : thrust::system::cuda::execution_policy<tag> { unspecified };
-
-
-/*! \p thrust::cuda::par is the parallel execution policy associated with Thrust's CUDA
- *  backend system.
- *
- *  Instead of relying on implicit algorithm dispatch through iterator system tags, users may
- *  directly target Thrust's CUDA backend system by providing \p thrust::cuda::par as an algorithm
- *  parameter.
- *
- *  Explicit dispatch can be useful in avoiding the introduction of data copies into containers such
- *  as \p thrust::cuda::vector.
- *
- *  The type of \p thrust::cuda::par is implementation-defined.
- *
- *  The following code snippet demonstrates how to use \p thrust::cuda::par to explicitly dispatch an
- *  invocation of \p thrust::for_each to the CUDA backend system:
- *
- *  \code
- *  #include <thrust/for_each.h>
- *  #include <thrust/system/cuda/execution_policy.h>
- *  #include <cstdio>
- *
- *  struct printf_functor
- *  {
- *    __host__ __device__
- *    void operator()(int x)
- *    {
- *      printf("%d\n");
- *    }
- *  };
- *  ...
- *  int vec[3];
- *  vec[0] = 0; vec[1] = 1; vec[2] = 2;
- *
- *  thrust::for_each(thrust::cuda::par, vec.begin(), vec.end(), printf_functor());
- *
- *  // 0 1 2 is printed to standard output in some unspecified order
- *  \endcode
- *
- *  Explicit dispatch may also be used to direct Thrust's CUDA backend to launch CUDA kernels implementing
- *  an algorithm invocation on a particular CUDA stream. In some cases, this may achieve concurrency with the
- *  caller and other algorithms and CUDA kernels executing on a separate CUDA stream. The following code
- *  snippet demonstrates how to use the \p thrust::cuda::par execution policy to explicitly dispatch invocations
- *  of \p thrust::for_each on separate CUDA streams:
- *
- *  \code
- *  #include <thrust/for_each.h>
- *  #include <thrust/system/cuda/execution_policy.h>
- *
- *  struct printf_functor
- *  {
- *    cudaStream_t s;
- *
- *    printf_functor(cudaStream_t s) : s(s) {}
- *
- *    __host__ __device__
- *    void operator()(int)
- *    {
- *      printf("Hello, world from stream %p\n", static_cast<void*>(s));
- *    }
- *  };
- *
- *  int main()
- *  {
- *    // create two CUDA streams
- *    cudaStream_t s1, s2;
- *    cudaStreamCreate(&s1);
- *    cudaStreamCreate(&s2);
- *  
- *    thrust::counting_iterator<int> iter(0);
- *  
- *    // execute for_each on two different streams
- *    thrust::for_each(thrust::cuda::par.on(s1), iter, iter + 1, printf_functor(s1));
- *    thrust::for_each(thrust::cuda::par.on(s2), iter, iter + 1, printf_functor(s2));
- *  
- *    // synchronize with both streams
- *    cudaStreamSynchronize(s1);
- *    cudaStreamSynchronize(s2);
- *  
- *    // destroy streams
- *    cudaStreamDestroy(s1);
- *    cudaStreamDestroy(s2);
- *  
- *    return 0;
- *  }
- *  \endcode
- *
- *  Even when using CUDA streams with \p thrust::cuda::par.on(), there is no guarantee of concurrency. Algorithms
- *  which return a data-dependent result or whose implementations require temporary memory allocation may
- *  cause blocking synchronization events. Moreover, it may be necessary to explicitly synchronize through
- *  \p cudaStreamSynchronize or similar before any effects induced through algorithm execution are visible to
- *  the rest of the system. Finally, it is the responsibility of the caller to own the lifetime of any CUDA
- *  streams involved.
- */
-static const unspecified par;
-
-
-/*! \}
- */
-
-
-} // end cuda
-} // end system
-} // end thrust
-#endif
-
-
diff --git a/thrust/system/cuda/experimental/pinned_allocator.h b/thrust/system/cuda/experimental/pinned_allocator.h
deleted file mode 100644
index 98e47aee1..000000000
--- a/thrust/system/cuda/experimental/pinned_allocator.h
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-/*! \file thrust/system/cuda/experimental/pinned_allocator.h
- *  \brief An allocator which creates new elements in "pinned" memory with \p cudaMallocHost
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
-#include <stdexcept>
-#include <limits>
-#include <string>
-#include <thrust/system/system_error.h>
-#include <thrust/system/cuda/error.h>
-
-namespace thrust
-{
-
-namespace system
-{
-
-namespace cuda
-{
-
-namespace experimental
-{
-
-/*! \addtogroup memory_management Memory Management
- *  \addtogroup memory_management_classes
- *  \ingroup memory_management
- *  \{
- */
-
-/*! \p pinned_allocator is a CUDA-specific host memory allocator
- *  that employs \c cudaMallocHost for allocation.
- *
- *  \see http://www.sgi.com/tech/stl/Allocators.html
- */
-template<typename T> class pinned_allocator;
-
-template<>
-  class pinned_allocator<void>
-{
-  public:
-    typedef void           value_type;
-    typedef void       *   pointer;
-    typedef const void *   const_pointer;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-
-    // convert a pinned_allocator<void> to pinned_allocator<U>
-    template<typename U>
-      struct rebind
-    {
-      typedef pinned_allocator<U> other;
-    }; // end rebind
-}; // end pinned_allocator
-
-
-template<typename T>
-  class pinned_allocator
-{
-  public:
-    typedef T              value_type;
-    typedef T*             pointer;
-    typedef const T*       const_pointer;
-    typedef T&             reference;
-    typedef const T&       const_reference;
-    typedef std::size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-
-    // convert a pinned_allocator<T> to pinned_allocator<U>
-    template<typename U>
-      struct rebind
-    {
-      typedef pinned_allocator<U> other;
-    }; // end rebind
-
-    /*! \p pinned_allocator's null constructor does nothing.
-     */
-    __host__ __device__
-    inline pinned_allocator() {}
-
-    /*! \p pinned_allocator's null destructor does nothing.
-     */
-    __host__ __device__
-    inline ~pinned_allocator() {}
-
-    /*! \p pinned_allocator's copy constructor does nothing.
-     */
-    __host__ __device__
-    inline pinned_allocator(pinned_allocator const &) {}
-
-    /*! This version of \p pinned_allocator's copy constructor
-     *  is templated on the \c value_type of the \p pinned_allocator
-     *  to copy from.  It is provided merely for convenience; it
-     *  does nothing.
-     */
-    template<typename U>
-    __host__ __device__
-    inline pinned_allocator(pinned_allocator<U> const &) {}
-
-    /*! This method returns the address of a \c reference of
-     *  interest.
-     *
-     *  \p r The \c reference of interest.
-     *  \return \c r's address.
-     */
-    __host__ __device__
-    inline pointer address(reference r) { return &r; }
-
-    /*! This method returns the address of a \c const_reference
-     *  of interest.
-     *
-     *  \p r The \c const_reference of interest.
-     *  \return \c r's address.
-     */
-    __host__ __device__
-    inline const_pointer address(const_reference r) { return &r; }
-
-    /*! This method allocates storage for objects in pinned host
-     *  memory.
-     *
-     *  \p cnt The number of objects to allocate.
-     *  \return a \c pointer to the newly allocated objects.
-     *  \note This method does not invoke \p value_type's constructor.
-     *        It is the responsibility of the caller to initialize the
-     *        objects at the returned \c pointer. 
-     */
-    __host__
-    inline pointer allocate(size_type cnt,
-                            const_pointer = 0)
-    {
-      if(cnt > this->max_size())
-      {
-        throw std::bad_alloc();
-      } // end if
-
-      pointer result(0);
-      cudaError_t error = cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type));
-
-      if(error)
-      {
-        throw std::bad_alloc();
-      } // end if
-
-      return result;
-    } // end allocate()
-
-    /*! This method deallocates pinned host memory previously allocated
-     *  with this \c pinned_allocator.
-     *
-     *  \p p A \c pointer to the previously allocated memory.
-     *  \p cnt The number of objects previously allocated at
-     *         \p p.
-     *  \note This method does not invoke \p value_type's destructor.
-     *        It is the responsibility of the caller to destroy
-     *        the objects stored at \p p.
-     */
-    __host__
-    inline void deallocate(pointer p, size_type cnt)
-    {
-      cudaError_t error = cudaFreeHost(p);
-      
-      if(error)
-      {
-        throw thrust::system_error(error, thrust::cuda_category());
-      } // end if
-    } // end deallocate()
-
-    /*! This method returns the maximum size of the \c cnt parameter
-     *  accepted by the \p allocate() method.
-     *
-     *  \return The maximum number of objects that may be allocated
-     *          by a single call to \p allocate().
-     */
-    inline size_type max_size() const
-    {
-      return (std::numeric_limits<size_type>::max)() / sizeof(T);
-    } // end max_size()
-
-    /*! This method tests this \p pinned_allocator for equality to
-     *  another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c true.
-     */
-    __host__ __device__
-    inline bool operator==(pinned_allocator const& x) { return true; }
-
-    /*! This method tests this \p pinned_allocator for inequality
-     *  to another.
-     *
-     *  \param x The other \p pinned_allocator of interest.
-     *  \return This method always returns \c false.
-     */
-    __host__ __device__
-    inline bool operator!=(pinned_allocator const &x) { return !operator==(x); }
-}; // end pinned_allocator
-
-/*! \}
- */
-
-} // end experimental
-
-} // end cuda
-
-} // end system
-
-// alias cuda's members at top-level
-namespace cuda
-{
-
-namespace experimental
-{
-
-using thrust::system::cuda::experimental::pinned_allocator;
-
-} // end experimental
-
-} // end cuda
-
-} // end thrust
-
diff --git a/thrust/system/cuda/future.h b/thrust/system/cuda/future.h
new file mode 100644
index 000000000..79bfc9134
--- /dev/null
+++ b/thrust/system/cuda/future.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2018 NVIDIA Corporation
+// Author: Bryce Adelstein Lelbach <brycelelbach@gmail.com>
+//
+// Distributed under the Boost Software License v1.0 (boost.org/LICENSE_1_0.txt)
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp14_required.h>
+
+#if THRUST_CPP_DIALECT >= 2014
+
+#include <thrust/system/cuda/pointer.h>
+#include <thrust/system/cuda/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system { namespace cuda
+{
+
+struct ready_event;
+
+template <typename T>
+struct ready_future;
+
+struct unique_eager_event;
+
+template <typename T>
+struct unique_eager_future;
+
+template <typename... Events>
+__host__
+unique_eager_event when_all(Events&&... evs);
+
+}} // namespace system::cuda
+
+namespace cuda
+{
+
+using thrust::system::cuda::ready_event;
+
+using thrust::system::cuda::ready_future;
+
+using thrust::system::cuda::unique_eager_event;
+using event = unique_eager_event;
+
+using thrust::system::cuda::unique_eager_future;
+template <typename T> using future = unique_eager_future<T>;
+
+using thrust::system::cuda::when_all;
+
+} // namespace cuda
+
+template <typename DerivedPolicy>
+__host__ 
+thrust::cuda::unique_eager_event
+unique_eager_event_type(
+  thrust::cuda::execution_policy<DerivedPolicy> const&
+) noexcept;
+
+template <typename T, typename DerivedPolicy>
+__host__ 
+thrust::cuda::unique_eager_future<T>
+unique_eager_future_type(
+  thrust::cuda::execution_policy<DerivedPolicy> const&
+) noexcept;
+
+THRUST_NAMESPACE_END
+
+#include <thrust/system/cuda/detail/future.inl>
+
+#endif // C++14
+
diff --git a/thrust/system/cuda/memory.h b/thrust/system/cuda/memory.h
index cfd91a950..eb8020adb 100644
--- a/thrust/system/cuda/memory.h
+++ b/thrust/system/cuda/memory.h
@@ -1,8 +1,8 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -21,295 +21,15 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/cuda/execution_policy.h>
+#include <thrust/system/cuda/memory_resource.h>
 #include <thrust/memory.h>
 #include <thrust/detail/type_traits.h>
-#include <thrust/detail/allocator/malloc_allocator.h>
+#include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
 {
-namespace system
-{
-namespace cuda
-{
-
-template<typename> class pointer;
-
-} // end cuda
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::cuda::pointer<Element> >
-{
-  private:
-    typedef thrust::system::cuda::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
-{
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::cuda
- *  \brief \p thrust::system::cuda is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's CUDA backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::cuda</tt>
- *         namespace for easy access.
- *
- */
-namespace cuda
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::cuda::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
-#if 0
-/*! \p cuda::tag is type representing Thrust's CUDA backend system in C++'s type system.
- *  Iterators "tagged" with a type which is convertible to \p cuda::tag assert that they may be
- *  "dispatched" to algorithm implementations in the \p cuda system.
- */
-struct tag { unspecified };
-#endif
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the cuda system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in cuda memory.
- *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
- *
- *  \p pointer can be created with the function \p cuda::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
- *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
- *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
- *
- *  \tparam T specifies the type of the pointee.
- *
- *  \see cuda::malloc
- *  \see cuda::free
- *  \see raw_pointer_cast
- */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::cuda::tag,
-               thrust::system::cuda::reference<T>,
-               thrust::system::cuda::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::cuda::tag,
-      //thrust::system::cuda::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::cuda::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p cuda system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::cuda::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p cuda system.
- *  \p reference is the type of the result of dereferencing a \p cuda::pointer.
- *
- *  \tparam T Specifies the type of the referenced object.
- */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::cuda::pointer<T>,
-               thrust::system::cuda::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::cuda::pointer<T>,
-      thrust::system::cuda::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    __host__ __device__
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference ot interest.
- */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
 
 /*! Allocates an area of memory available to Thrust's <tt>cuda</tt> system.
  *  \param n Number of bytes to allocate.
@@ -321,22 +41,20 @@ void swap(reference<T> x, reference<T> y);
  *  \see cuda::free
  *  \see std::malloc
  */
-inline __host__ __device__
-pointer<void> malloc(std::size_t n);
+inline __host__ __device__ pointer<void> malloc(std::size_t n);
 
 /*! Allocates a typed area of memory available to Thrust's <tt>cuda</tt> system.
  *  \param n Number of elements to allocate.
  *  \return A <tt>cuda::pointer<T></tt> pointing to the beginning of the newly
- *          allocated memory. A null <tt>cuda::pointer<T></tt> is returned if
+ *          allocated elements. A null <tt>cuda::pointer<T></tt> is returned if
  *          an error occurs.
  *  \note The <tt>cuda::pointer<T></tt> returned by this function must be
  *        deallocated with \p cuda::free.
  *  \see cuda::free
  *  \see std::malloc
  */
-template<typename T>
-inline __host__ __device__
-pointer<T> malloc(std::size_t n);
+template <typename T>
+inline __host__ __device__ pointer<T> malloc(std::size_t n);
 
 /*! Deallocates an area of memory previously allocated by <tt>cuda::malloc</tt>.
  *  \param ptr A <tt>cuda::pointer<void></tt> pointing to the beginning of an area
@@ -344,81 +62,48 @@ pointer<T> malloc(std::size_t n);
  *  \see cuda::malloc
  *  \see std::free
  */
-inline __host__ __device__
-void free(pointer<void> ptr);
-
-// XXX upon c++11
-// template<typename T> using allocator = thrust::detail::malloc_allocator<T,tag,pointer<T> >;
+inline __host__ __device__ void free(pointer<void> ptr);
 
-/*! \p cuda::allocator is the default allocator used by the \p cuda system's containers such as
- *  <tt>cuda::vector</tt> if no user-specified allocator is provided. \p cuda::allocator allocates
- *  (deallocates) storage with \p cuda::malloc (\p cuda::free).
+/*! \p cuda::allocator is the default allocator used by the \p cuda system's
+ *  containers such as <tt>cuda::vector</tt> if no user-specified allocator is
+ *  provided. \p cuda::allocator allocates (deallocates) storage with \p
+ *  cuda::malloc (\p cuda::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::detail::malloc_allocator<
-        T,
-        tag,
-        pointer<T>
-      >
-{
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator &) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> &) {}
-
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end cuda
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cuda::memory_resource
+>;
 
-/*! \}
+/*! \p cuda::universal_allocator allocates memory that can be used by the \p cuda
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::cuda::universal_memory_resource
+>;
+
+} // namespace cuda_cub
 
-} // end system
+namespace system { namespace cuda
+{
+using thrust::cuda_cub::malloc;
+using thrust::cuda_cub::free;
+using thrust::cuda_cub::allocator;
+using thrust::cuda_cub::universal_allocator;
+}} // namespace system::cuda
 
 /*! \namespace thrust::cuda
- *  \brief \p thrust::cuda is a top-level alias for thrust::system::cuda.
+ *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda.
  */
 namespace cuda
 {
+using thrust::cuda_cub::malloc;
+using thrust::cuda_cub::free;
+using thrust::cuda_cub::allocator;
+using thrust::cuda_cub::universal_allocator;
+} // namespace cuda
 
-using thrust::system::cuda::pointer;
-using thrust::system::cuda::reference;
-using thrust::system::cuda::malloc;
-using thrust::system::cuda::free;
-using thrust::system::cuda::allocator;
-
-} // end cuda
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/cuda/detail/memory.inl>
 
diff --git a/thrust/system/cuda/memory_resource.h b/thrust/system/cuda/memory_resource.h
new file mode 100644
index 000000000..4bf534e40
--- /dev/null
+++ b/thrust/system/cuda/memory_resource.h
@@ -0,0 +1,126 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file cuda/memory_resource.h
+ *  \brief Memory resources for the CUDA system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/mr/memory_resource.h>
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+#include <thrust/system/cuda/pointer.h>
+#include <thrust/system/detail/bad_alloc.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system/cuda/detail/util.h>
+
+#include <thrust/mr/host_memory_resource.h>
+
+THRUST_NAMESPACE_BEGIN
+
+namespace system
+{
+namespace cuda
+{
+
+//! \cond
+namespace detail
+{
+
+    typedef cudaError_t (CUDARTAPI *allocation_fn)(void **, std::size_t);
+    typedef cudaError_t (CUDARTAPI *deallocation_fn)(void *);
+
+    template<allocation_fn Alloc, deallocation_fn Dealloc, typename Pointer>
+    class cuda_memory_resource final : public mr::memory_resource<Pointer>
+    {
+    public:
+        Pointer do_allocate(std::size_t bytes, std::size_t alignment = THRUST_MR_DEFAULT_ALIGNMENT) override
+        {
+            (void)alignment;
+
+            void * ret;
+            cudaError_t status = Alloc(&ret, bytes);
+
+            if (status != cudaSuccess)
+            {
+                cudaGetLastError(); // Clear the CUDA global error state.
+                throw thrust::system::detail::bad_alloc(thrust::cuda_category().message(status).c_str());
+            }
+
+            return Pointer(ret);
+        }
+
+        void do_deallocate(Pointer p, std::size_t bytes, std::size_t alignment) override
+        {
+            (void)bytes;
+            (void)alignment;
+
+            cudaError_t status = Dealloc(thrust::detail::pointer_traits<Pointer>::get(p));
+
+            if (status != cudaSuccess)
+            {
+                thrust::cuda_cub::throw_on_error(status, "CUDA free failed");
+            }
+        }
+    };
+
+    inline cudaError_t CUDARTAPI cudaMallocManaged(void ** ptr, std::size_t bytes)
+    {
+        return ::cudaMallocManaged(ptr, bytes, cudaMemAttachGlobal);
+    }
+
+    typedef detail::cuda_memory_resource<cudaMalloc, cudaFree,
+        thrust::cuda::pointer<void> >
+        device_memory_resource;
+    typedef detail::cuda_memory_resource<detail::cudaMallocManaged, cudaFree,
+        thrust::cuda::universal_pointer<void> >
+        managed_memory_resource;
+    typedef detail::cuda_memory_resource<cudaMallocHost, cudaFreeHost,
+        thrust::cuda::universal_pointer<void> >
+        pinned_memory_resource;
+
+} // end detail
+//! \endcond
+
+/*! The memory resource for the CUDA system. Uses <tt>cudaMalloc</tt> and wraps
+ *  the result with \p cuda::pointer.
+ */
+typedef detail::device_memory_resource memory_resource;
+/*! The universal memory resource for the CUDA system. Uses
+ *  <tt>cudaMallocManaged</tt> and wraps the result with
+ *  \p cuda::universal_pointer.
+ */
+typedef detail::managed_memory_resource universal_memory_resource;
+/*! The host pinned memory resource for the CUDA system. Uses
+ *  <tt>cudaMallocHost</tt> and wraps the result with \p
+ *  cuda::universal_pointer.
+ */
+typedef detail::pinned_memory_resource universal_host_pinned_memory_resource;
+
+} // end cuda
+} // end system
+
+namespace cuda
+{
+using thrust::system::cuda::memory_resource;
+using thrust::system::cuda::universal_memory_resource;
+using thrust::system::cuda::universal_host_pinned_memory_resource;
+}
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cuda/pointer.h b/thrust/system/cuda/pointer.h
new file mode 100644
index 000000000..ace77fbae
--- /dev/null
+++ b/thrust/system/cuda/pointer.h
@@ -0,0 +1,137 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/cuda/memory.h
+ *  \brief Managing memory associated with Thrust's Standard C++ system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <type_traits>
+#include <thrust/system/cuda/detail/execution_policy.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
+{
+
+/*! \p cuda::pointer stores a pointer to an object allocated in memory
+ *  accessible by the \p cuda system. This type provides type safety when
+ *  dispatching algorithms on ranges resident in \p cuda memory.
+ *
+ *  \p cuda::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p cuda::pointer can be created with the function \p cuda::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cuda::pointer may be obtained by eiter
+ *  its <tt>get</tt> member function or the \p raw_pointer_cast function.
+ *
+ *  \note \p cuda::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p cuda::pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cuda::malloc
+ *  \see cuda::free
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::cuda_cub::tag,
+  thrust::tagged_reference<T, thrust::cuda_cub::tag>
+>;
+
+/*! \p cuda::universal_pointer stores a pointer to an object allocated in
+ *  memory accessible by the \p cuda system and host systems.
+ *
+ *  \p cuda::universal_pointer has pointer semantics: it may be dereferenced
+ *  and manipulated with pointer arithmetic.
+ *
+ *  \p cuda::universal_pointer can be created with \p cuda::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p cuda::universal_pointer may be
+ *  obtained by eiter its <tt>get</tt> member function or the \p
+ *  raw_pointer_cast function.
+ *
+ *  \note \p cuda::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p cuda::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see cuda::universal_allocator
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::cuda_cub::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p cuda::reference is a wrapped reference to an object stored in memory
+ *  accessible by the \p cuda system. \p cuda::reference is the type of the
+ *  result of dereferencing a \p cuda::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ *
+ *  \see cuda::pointer
+ */
+template <typename T>
+using reference = thrust::tagged_reference<T, thrust::cuda_cub::tag>;
+
+} // namespace cuda_cub
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::system::cuda
+ *  \brief \p thrust::system::cuda is the namespace containing functionality
+ *  for allocating, manipulating, and deallocating memory available to Thrust's
+ *  CUDA backend system. The identifiers are provided in a separate namespace
+ *  underneath \p thrust::system for import convenience but are also
+ *  aliased in the top-level <tt>thrust::cuda</tt> namespace for easy access.
+ *
+ */
+namespace system { namespace cuda
+{
+using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::universal_pointer;
+using thrust::cuda_cub::reference;
+}} // namespace system::cuda
+/*! \}
+ */
+
+/*! \namespace thrust::cuda
+ *  \brief \p thrust::cuda is a top-level alias for \p thrust::system::cuda.
+ */
+namespace cuda
+{
+using thrust::cuda_cub::pointer;
+using thrust::cuda_cub::universal_pointer;
+using thrust::cuda_cub::reference;
+} // namespace cuda
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/cuda/vector.h b/thrust/system/cuda/vector.h
index c168da6e8..fafc7bf17 100644
--- a/thrust/system/cuda/vector.h
+++ b/thrust/system/cuda/vector.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ccudaliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -26,123 +26,64 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace cuda_cub
 {
 
-// forward declaration of host_vector
-template<typename T, typename Allocator> class host_vector;
-
-namespace system
-{
-namespace cuda
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
 /*! \p cuda::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p cuda::vector may vary dynamically; memory management is
  *  automatic. The elements contained in a \p cuda::vector reside in memory
- *  available to the \p cuda system.
+ *  accessible by the \p cuda system.
  *
  *  \tparam T The element type of the \p cuda::vector.
- *  \tparam Allocator The allocator type of the \p cuda::vector. Defaults to \p cuda::allocator.
+ *  \tparam Allocator The allocator type of the \p cuda::vector.
+ *          Defaults to \p cuda::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
  *                   shared by \p cuda::vector
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p cuda::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p cuda::vector with \p n default-constructed elements.
-     *  \param n The size of the \p cuda::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p cuda::vector with \p n copies of \p value.
-     *  \param n The size of the \p cuda::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p cuda::vector.
-     *  \param x The other \p cuda::vector to copy.
-     */
-    vector(const vector &x);
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p cuda::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-    //
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
+template <typename T, typename Allocator = thrust::system::cuda::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p cuda::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p cuda::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p cuda::universal_vector reside in memory accessible by the \p cuda system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p cuda::universal_vector.
+ *  \tparam Allocator The allocator type of the \p cuda::universal_vector.
+ *          Defaults to \p cuda::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p cuda::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::cuda::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+} // namespace cuda_cub
 
-} // end cuda
-} // end system
+namespace system { namespace cuda
+{
+using thrust::cuda_cub::vector;
+using thrust::cuda_cub::universal_vector;
+}}
 
-// alias system::cuda names at top-level
 namespace cuda
 {
+using thrust::cuda_cub::vector;
+using thrust::cuda_cub::universal_vector;
+}
 
-using thrust::system::cuda::vector;
-
-} // end cuda
-
-} // end thrust
-
-#include <thrust/system/cuda/detail/vector.inl>
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/adl/adjacent_difference.h b/thrust/system/detail/adl/adjacent_difference.h
index 68bc08560..c6f6c7282 100644
--- a/thrust/system/detail/adl/adjacent_difference.h
+++ b/thrust/system/detail/adl/adjacent_difference.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/adjacent_difference.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/adjacent_difference.h>
+#include <thrust/system/cuda/detail/adjacent_difference.h>
+#include <thrust/system/omp/detail/adjacent_difference.h>
+#include <thrust/system/tbb/detail/adjacent_difference.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/adjacent_difference.h>
 #include __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER
 #undef __THRUST_HOST_SYSTEM_ADJACENT_DIFFERENCE_HEADER
diff --git a/thrust/system/detail/adl/assign_value.h b/thrust/system/detail/adl/assign_value.h
index 192e7ea36..d38934aff 100644
--- a/thrust/system/detail/adl/assign_value.h
+++ b/thrust/system/detail/adl/assign_value.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/assign_value.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/assign_value.h>
+#include <thrust/system/cuda/detail/assign_value.h>
+#include <thrust/system/omp/detail/assign_value.h>
+#include <thrust/system/tbb/detail/assign_value.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/assign_value.h>
 #include __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER
 #undef __THRUST_HOST_SYSTEM_ASSIGN_VALUE_HEADER
diff --git a/thrust/system/cuda/detail/default_decomposition.inl b/thrust/system/detail/adl/async/copy.h
similarity index 50%
rename from thrust/system/cuda/detail/default_decomposition.inl
rename to thrust/system/detail/adl/async/copy.h
index 7c515c5c3..72debb3b6 100644
--- a/thrust/system/cuda/detail/default_decomposition.inl
+++ b/thrust/system/detail/adl/async/copy.h
@@ -14,31 +14,21 @@
  *  limitations under the License.
  */
 
-#include <thrust/detail/config.h>
-#include <thrust/system/cuda/detail/runtime_introspection.h>
+// The purpose of this header is to #include the async/copy.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async copy.
 
-namespace thrust
-{
-namespace system
-{
-namespace cuda
-{
-namespace detail
-{
+#pragma once
 
+#include <thrust/detail/config.h>
 
-template<typename IndexType>
-__host__ __device__
-thrust::system::detail::internal::uniform_decomposition<IndexType> default_decomposition(IndexType n)
-{
-  // TODO eliminate magical constant
-  device_properties_t properties = device_properties();
-  return thrust::system::detail::internal::uniform_decomposition<IndexType>(n, properties.maxThreadsPerBlock, 10 * properties.multiProcessorCount);
-}
+//#include <thrust/system/detail/sequential/async/copy.h>
 
+//#define __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/copy.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_COPY_HEADER
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
+#define __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/copy.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_COPY_HEADER
 
diff --git a/thrust/system/detail/adl/async/for_each.h b/thrust/system/detail/adl/async/for_each.h
new file mode 100644
index 000000000..08347f659
--- /dev/null
+++ b/thrust/system/detail/adl/async/for_each.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/for_each.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async for_each.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/for_each.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/for_each.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_FOR_EACH_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/for_each.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_FOR_EACH_HEADER
+
diff --git a/thrust/system/detail/adl/async/reduce.h b/thrust/system/detail/adl/async/reduce.h
new file mode 100644
index 000000000..f13ab02fd
--- /dev/null
+++ b/thrust/system/detail/adl/async/reduce.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/reduce.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async reduce.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/reduce.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/reduce.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_REDUCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/reduce.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_REDUCE_HEADER
+
diff --git a/thrust/system/detail/adl/async/scan.h b/thrust/system/detail/adl/async/scan.h
new file mode 100644
index 000000000..a2a90618b
--- /dev/null
+++ b/thrust/system/detail/adl/async/scan.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/scan.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async scans.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/scan.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/scan.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SCAN_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/scan.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SCAN_HEADER
+
diff --git a/thrust/system/detail/adl/async/sort.h b/thrust/system/detail/adl/async/sort.h
new file mode 100644
index 000000000..c3a83ad40
--- /dev/null
+++ b/thrust/system/detail/adl/async/sort.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/sort.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async sort.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/sort.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/sort.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_SORT_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/sort.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_SORT_HEADER
+
diff --git a/thrust/system/detail/adl/async/transform.h b/thrust/system/detail/adl/async/transform.h
new file mode 100644
index 000000000..abb2163ea
--- /dev/null
+++ b/thrust/system/detail/adl/async/transform.h
@@ -0,0 +1,34 @@
+/*
+ *  Copyright 2008-2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a transform of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+// The purpose of this header is to #include the async/transform.h header of the
+// sequential, host, and device systems. It should be #included in any code
+// which uses ADL to dispatch async transform.
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+//#include <thrust/system/detail/sequential/async/transform.h>
+
+//#define __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/async/transform.h>
+//#include __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER
+//#undef __THRUST_HOST_SYSTEM_ASYNC_TRANSFORM_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/async/transform.h>
+#include __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER
+#undef __THRUST_DEVICE_SYSTEM_ASYNC_TRANSFORM_HEADER
+
diff --git a/thrust/system/detail/adl/binary_search.h b/thrust/system/detail/adl/binary_search.h
index 37fa75066..2f9ac06df 100644
--- a/thrust/system/detail/adl/binary_search.h
+++ b/thrust/system/detail/adl/binary_search.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/binary_search.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/binary_search.h>
+#include <thrust/system/cuda/detail/binary_search.h>
+#include <thrust/system/omp/detail/binary_search.h>
+#include <thrust/system/tbb/detail/binary_search.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/binary_search.h>
 #include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
 #undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
diff --git a/thrust/system/detail/adl/copy.h b/thrust/system/detail/adl/copy.h
index 4e3a0b809..0035b83ef 100644
--- a/thrust/system/detail/adl/copy.h
+++ b/thrust/system/detail/adl/copy.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/copy.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/copy.h>
+#include <thrust/system/cuda/detail/copy.h>
+#include <thrust/system/omp/detail/copy.h>
+#include <thrust/system/tbb/detail/copy.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy.h>
 #include __THRUST_HOST_SYSTEM_COPY_HEADER
 #undef __THRUST_HOST_SYSTEM_COPY_HEADER
diff --git a/thrust/system/detail/adl/copy_if.h b/thrust/system/detail/adl/copy_if.h
index eb73fb079..31adaf8e1 100644
--- a/thrust/system/detail/adl/copy_if.h
+++ b/thrust/system/detail/adl/copy_if.h
@@ -24,11 +24,21 @@
 
 #include <thrust/system/detail/sequential/copy_if.h>
 
-#define __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy_if.h>
-#include __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
-#undef __THRUST_HOST_SYSTEM_BINARY_SEARCH_HEADER
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/copy_if.h>
+#include <thrust/system/cuda/detail/copy_if.h>
+#include <thrust/system/omp/detail/copy_if.h>
+#include <thrust/system/tbb/detail/copy_if.h>
+#endif
 
-#define __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy_if.h>
-#include __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER
-#undef __THRUST_DEVICE_SYSTEM_BINARY_SEARCH_HEADER
+#define __THRUST_HOST_SYSTEM_COPY_IF_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/copy_if.h>
+#include __THRUST_HOST_SYSTEM_COPY_IF_HEADER
+#undef __THRUST_HOST_SYSTEM_COPY_IF_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/copy_if.h>
+#include __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER
+#undef __THRUST_DEVICE_SYSTEM_COPY_IF_HEADER
 
diff --git a/thrust/system/detail/adl/count.h b/thrust/system/detail/adl/count.h
index fb6f10669..5d6f1f748 100644
--- a/thrust/system/detail/adl/count.h
+++ b/thrust/system/detail/adl/count.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/count.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/count.h>
+#include <thrust/system/cuda/detail/count.h>
+#include <thrust/system/omp/detail/count.h>
+#include <thrust/system/tbb/detail/count.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_COUNT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/count.h>
 #include __THRUST_HOST_SYSTEM_COUNT_HEADER
 #undef __THRUST_HOST_SYSTEM_COUNT_HEADER
diff --git a/thrust/system/detail/adl/equal.h b/thrust/system/detail/adl/equal.h
index cbe673fa2..6b02e33b8 100644
--- a/thrust/system/detail/adl/equal.h
+++ b/thrust/system/detail/adl/equal.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/equal.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/equal.h>
+#include <thrust/system/cuda/detail/equal.h>
+#include <thrust/system/omp/detail/equal.h>
+#include <thrust/system/tbb/detail/equal.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_EQUAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/equal.h>
 #include __THRUST_HOST_SYSTEM_EQUAL_HEADER
 #undef __THRUST_HOST_SYSTEM_EQUAL_HEADER
diff --git a/thrust/system/detail/adl/extrema.h b/thrust/system/detail/adl/extrema.h
index 2af0caffa..62fb39be9 100644
--- a/thrust/system/detail/adl/extrema.h
+++ b/thrust/system/detail/adl/extrema.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/extrema.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/extrema.h>
+#include <thrust/system/cuda/detail/extrema.h>
+#include <thrust/system/omp/detail/extrema.h>
+#include <thrust/system/tbb/detail/extrema.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_EXTREMA_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/extrema.h>
 #include __THRUST_HOST_SYSTEM_EXTREMA_HEADER
 #undef __THRUST_HOST_SYSTEM_EXTREMA_HEADER
diff --git a/thrust/system/detail/adl/fill.h b/thrust/system/detail/adl/fill.h
index cbe33f7c9..f76a81b4f 100644
--- a/thrust/system/detail/adl/fill.h
+++ b/thrust/system/detail/adl/fill.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/fill.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/fill.h>
+#include <thrust/system/cuda/detail/fill.h>
+#include <thrust/system/omp/detail/fill.h>
+#include <thrust/system/tbb/detail/fill.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/fill.h>
 #include __THRUST_HOST_SYSTEM_FILL_HEADER
 #undef __THRUST_HOST_SYSTEM_FILL_HEADER
diff --git a/thrust/system/detail/adl/find.h b/thrust/system/detail/adl/find.h
index 89dbf468d..8d85e09a3 100644
--- a/thrust/system/detail/adl/find.h
+++ b/thrust/system/detail/adl/find.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/find.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/find.h>
+#include <thrust/system/cuda/detail/find.h>
+#include <thrust/system/omp/detail/find.h>
+#include <thrust/system/tbb/detail/find.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_FIND_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/find.h>
 #include __THRUST_HOST_SYSTEM_FIND_HEADER
 #undef __THRUST_HOST_SYSTEM_FIND_HEADER
diff --git a/thrust/system/detail/adl/for_each.h b/thrust/system/detail/adl/for_each.h
index 20dd8372e..8509edca3 100644
--- a/thrust/system/detail/adl/for_each.h
+++ b/thrust/system/detail/adl/for_each.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/for_each.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/for_each.h>
+#include <thrust/system/cuda/detail/for_each.h>
+#include <thrust/system/omp/detail/for_each.h>
+#include <thrust/system/tbb/detail/for_each.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_FOR_EACH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/for_each.h>
 #include __THRUST_HOST_SYSTEM_FOR_EACH_HEADER
 #undef __THRUST_HOST_SYSTEM_FOR_EACH_HEADER
diff --git a/thrust/system/detail/adl/gather.h b/thrust/system/detail/adl/gather.h
index 7040f119a..242da3c90 100644
--- a/thrust/system/detail/adl/gather.h
+++ b/thrust/system/detail/adl/gather.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/gather.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/gather.h>
+#include <thrust/system/cuda/detail/gather.h>
+#include <thrust/system/omp/detail/gather.h>
+#include <thrust/system/tbb/detail/gather.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_GATHER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/gather.h>
 #include __THRUST_HOST_SYSTEM_GATHER_HEADER
 #undef __THRUST_HOST_SYSTEM_GATHER_HEADER
diff --git a/thrust/system/detail/adl/generate.h b/thrust/system/detail/adl/generate.h
index e19c4cd5e..5b1d7b4ba 100644
--- a/thrust/system/detail/adl/generate.h
+++ b/thrust/system/detail/adl/generate.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/generate.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/generate.h>
+#include <thrust/system/cuda/detail/generate.h>
+#include <thrust/system/omp/detail/generate.h>
+#include <thrust/system/tbb/detail/generate.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_GENERATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/generate.h>
 #include __THRUST_HOST_SYSTEM_GENERATE_HEADER
 #undef __THRUST_HOST_SYSTEM_GENERATE_HEADER
diff --git a/thrust/system/detail/adl/get_value.h b/thrust/system/detail/adl/get_value.h
index 78bccfc4a..306eb423e 100644
--- a/thrust/system/detail/adl/get_value.h
+++ b/thrust/system/detail/adl/get_value.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/get_value.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/get_value.h>
+#include <thrust/system/cuda/detail/get_value.h>
+#include <thrust/system/omp/detail/get_value.h>
+#include <thrust/system/tbb/detail/get_value.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_GET_VALUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/get_value.h>
 #include __THRUST_HOST_SYSTEM_GET_VALUE_HEADER
 #undef __THRUST_HOST_SYSTEM_GET_VALUE_HEADER
diff --git a/thrust/system/detail/adl/inner_product.h b/thrust/system/detail/adl/inner_product.h
index fcefdf4c4..9423b1bdb 100644
--- a/thrust/system/detail/adl/inner_product.h
+++ b/thrust/system/detail/adl/inner_product.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/inner_product.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/inner_product.h>
+#include <thrust/system/cuda/detail/inner_product.h>
+#include <thrust/system/omp/detail/inner_product.h>
+#include <thrust/system/tbb/detail/inner_product.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/inner_product.h>
 #include __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER
 #undef __THRUST_HOST_SYSTEM_INNER_PRODUCT_HEADER
diff --git a/thrust/system/detail/adl/iter_swap.h b/thrust/system/detail/adl/iter_swap.h
index 8716a2ff0..d9da52a62 100644
--- a/thrust/system/detail/adl/iter_swap.h
+++ b/thrust/system/detail/adl/iter_swap.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/iter_swap.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/iter_swap.h>
+#include <thrust/system/cuda/detail/iter_swap.h>
+#include <thrust/system/omp/detail/iter_swap.h>
+#include <thrust/system/tbb/detail/iter_swap.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/iter_swap.h>
 #include __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER
 #undef __THRUST_HOST_SYSTEM_ITER_SWAP_HEADER
diff --git a/thrust/system/detail/adl/logical.h b/thrust/system/detail/adl/logical.h
index 313214e1a..bdaad4d29 100644
--- a/thrust/system/detail/adl/logical.h
+++ b/thrust/system/detail/adl/logical.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/logical.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/logical.h>
+#include <thrust/system/cuda/detail/logical.h>
+#include <thrust/system/omp/detail/logical.h>
+#include <thrust/system/tbb/detail/logical.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_LOGICAL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/logical.h>
 #include __THRUST_HOST_SYSTEM_LOGICAL_HEADER
 #undef __THRUST_HOST_SYSTEM_LOGICAL_HEADER
diff --git a/thrust/system/detail/adl/malloc_and_free.h b/thrust/system/detail/adl/malloc_and_free.h
index 1d36e8c50..c36db0270 100644
--- a/thrust/system/detail/adl/malloc_and_free.h
+++ b/thrust/system/detail/adl/malloc_and_free.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/malloc_and_free.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/malloc_and_free.h>
+#include <thrust/system/cuda/detail/malloc_and_free.h>
+#include <thrust/system/omp/detail/malloc_and_free.h>
+#include <thrust/system/tbb/detail/malloc_and_free.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/malloc_and_free.h>
 #include __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER
 #undef __THRUST_HOST_SYSTEM_MALLOC_AND_FREE_HEADER
diff --git a/thrust/system/detail/adl/merge.h b/thrust/system/detail/adl/merge.h
index ac6b7f3e3..7abca9bcf 100644
--- a/thrust/system/detail/adl/merge.h
+++ b/thrust/system/detail/adl/merge.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/merge.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/merge.h>
+#include <thrust/system/cuda/detail/merge.h>
+#include <thrust/system/omp/detail/merge.h>
+#include <thrust/system/tbb/detail/merge.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_MERGE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/merge.h>
 #include __THRUST_HOST_SYSTEM_MERGE_HEADER
 #undef __THRUST_HOST_SYSTEM_MERGE_HEADER
diff --git a/thrust/system/detail/adl/mismatch.h b/thrust/system/detail/adl/mismatch.h
index 03b4e4abb..74feb8269 100644
--- a/thrust/system/detail/adl/mismatch.h
+++ b/thrust/system/detail/adl/mismatch.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/mismatch.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/mismatch.h>
+#include <thrust/system/cuda/detail/mismatch.h>
+#include <thrust/system/omp/detail/mismatch.h>
+#include <thrust/system/tbb/detail/mismatch.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_MISMATCH_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/mismatch.h>
 #include __THRUST_HOST_SYSTEM_MISMATCH_HEADER
 #undef __THRUST_HOST_SYSTEM_MISMATCH_HEADER
diff --git a/thrust/system/detail/adl/partition.h b/thrust/system/detail/adl/partition.h
index 1ce31b6d6..a45f845a5 100644
--- a/thrust/system/detail/adl/partition.h
+++ b/thrust/system/detail/adl/partition.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/partition.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/partition.h>
+#include <thrust/system/cuda/detail/partition.h>
+#include <thrust/system/omp/detail/partition.h>
+#include <thrust/system/tbb/detail/partition.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_PARTITION_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/partition.h>
 #include __THRUST_HOST_SYSTEM_PARTITION_HEADER
 #undef __THRUST_HOST_SYSTEM_PARTITION_HEADER
diff --git a/thrust/system/detail/adl/per_device_resource.h b/thrust/system/detail/adl/per_device_resource.h
new file mode 100644
index 000000000..721f49e03
--- /dev/null
+++ b/thrust/system/detail/adl/per_device_resource.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// the purpose of this header is to #include the per_device_resource.h header
+// of the sequential, host, and device systems. It should be #included in any
+// code which uses adl to dispatch per_device_resource
+
+#include <thrust/system/detail/sequential/per_device_resource.h>
+
+#if 0
+#include <thrust/system/cpp/detail/per_device_resource.h>
+#include <thrust/system/cuda/detail/per_device_resource.h>
+#include <thrust/system/omp/detail/per_device_resource.h>
+#include <thrust/system/tbb/detail/per_device_resource.h>
+#endif
+
+#define __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/per_device_resource.h>
+#include __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+#undef __THRUST_HOST_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+
+#define __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/detail/per_device_resource.h>
+#include __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+#undef __THRUST_DEVICE_SYSTEM_PER_DEVICE_RESOURCE_HEADER
+
diff --git a/thrust/system/detail/adl/reduce.h b/thrust/system/detail/adl/reduce.h
index 8bbe623b5..8a9673b3f 100644
--- a/thrust/system/detail/adl/reduce.h
+++ b/thrust/system/detail/adl/reduce.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/reduce.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reduce.h>
+#include <thrust/system/cuda/detail/reduce.h>
+#include <thrust/system/omp/detail/reduce.h>
+#include <thrust/system/tbb/detail/reduce.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce.h>
 #include __THRUST_HOST_SYSTEM_REDUCE_HEADER
 #undef __THRUST_HOST_SYSTEM_REDUCE_HEADER
diff --git a/thrust/system/detail/adl/reduce_by_key.h b/thrust/system/detail/adl/reduce_by_key.h
index 0ce1c78ec..0605f9bef 100644
--- a/thrust/system/detail/adl/reduce_by_key.h
+++ b/thrust/system/detail/adl/reduce_by_key.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/reduce_by_key.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reduce_by_key.h>
+#include <thrust/system/cuda/detail/reduce_by_key.h>
+#include <thrust/system/omp/detail/reduce_by_key.h>
+#include <thrust/system/tbb/detail/reduce_by_key.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reduce_by_key.h>
 #include __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER
 #undef __THRUST_HOST_SYSTEM_REDUCE_BY_KEY_HEADER
diff --git a/thrust/system/detail/adl/remove.h b/thrust/system/detail/adl/remove.h
index 5aaf06280..c281379d5 100644
--- a/thrust/system/detail/adl/remove.h
+++ b/thrust/system/detail/adl/remove.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/remove.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/remove.h>
+#include <thrust/system/cuda/detail/remove.h>
+#include <thrust/system/omp/detail/remove.h>
+#include <thrust/system/tbb/detail/remove.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_REMOVE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/remove.h>
 #include __THRUST_HOST_SYSTEM_REMOVE_HEADER
 #undef __THRUST_HOST_SYSTEM_REMOVE_HEADER
diff --git a/thrust/system/detail/adl/replace.h b/thrust/system/detail/adl/replace.h
index 6a73c9c62..d8fb5746f 100644
--- a/thrust/system/detail/adl/replace.h
+++ b/thrust/system/detail/adl/replace.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/replace.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/replace.h>
+#include <thrust/system/cuda/detail/replace.h>
+#include <thrust/system/omp/detail/replace.h>
+#include <thrust/system/tbb/detail/replace.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_REPLACE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/replace.h>
 #include __THRUST_HOST_SYSTEM_REPLACE_HEADER
 #undef __THRUST_HOST_SYSTEM_REPLACE_HEADER
diff --git a/thrust/system/detail/adl/reverse.h b/thrust/system/detail/adl/reverse.h
index 64b2f8e28..f6bd8947e 100644
--- a/thrust/system/detail/adl/reverse.h
+++ b/thrust/system/detail/adl/reverse.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/reverse.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/reverse.h>
+#include <thrust/system/cuda/detail/reverse.h>
+#include <thrust/system/omp/detail/reverse.h>
+#include <thrust/system/tbb/detail/reverse.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_REVERSE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/reverse.h>
 #include __THRUST_HOST_SYSTEM_REVERSE_HEADER
 #undef __THRUST_HOST_SYSTEM_REVERSE_HEADER
diff --git a/thrust/system/detail/adl/scan.h b/thrust/system/detail/adl/scan.h
index a4ded752b..a24910410 100644
--- a/thrust/system/detail/adl/scan.h
+++ b/thrust/system/detail/adl/scan.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/scan.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scan.h>
+#include <thrust/system/cuda/detail/scan.h>
+#include <thrust/system/omp/detail/scan.h>
+#include <thrust/system/tbb/detail/scan.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan.h>
 #include __THRUST_HOST_SYSTEM_SCAN_HEADER
 #undef __THRUST_HOST_SYSTEM_SCAN_HEADER
diff --git a/thrust/system/detail/adl/scan_by_key.h b/thrust/system/detail/adl/scan_by_key.h
index d15351193..94f73503c 100644
--- a/thrust/system/detail/adl/scan_by_key.h
+++ b/thrust/system/detail/adl/scan_by_key.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/scan_by_key.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scan_by_key.h>
+#include <thrust/system/cuda/detail/scan_by_key.h>
+#include <thrust/system/omp/detail/scan_by_key.h>
+#include <thrust/system/tbb/detail/scan_by_key.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scan_by_key.h>
 #include __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER
 #undef __THRUST_HOST_SYSTEM_SCAN_BY_KEY_HEADER
diff --git a/thrust/system/detail/adl/scatter.h b/thrust/system/detail/adl/scatter.h
index 064bca452..d9f42b28b 100644
--- a/thrust/system/detail/adl/scatter.h
+++ b/thrust/system/detail/adl/scatter.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/scatter.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/scatter.h>
+#include <thrust/system/cuda/detail/scatter.h>
+#include <thrust/system/omp/detail/scatter.h>
+#include <thrust/system/tbb/detail/scatter.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SCATTER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/scatter.h>
 #include __THRUST_HOST_SYSTEM_SCATTER_HEADER
 #undef __THRUST_HOST_SYSTEM_SCATTER_HEADER
diff --git a/thrust/system/detail/adl/sequence.h b/thrust/system/detail/adl/sequence.h
index 7d580a7f5..d3c2a20f4 100644
--- a/thrust/system/detail/adl/sequence.h
+++ b/thrust/system/detail/adl/sequence.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/sequence.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/sequence.h>
+#include <thrust/system/cuda/detail/sequence.h>
+#include <thrust/system/omp/detail/sequence.h>
+#include <thrust/system/tbb/detail/sequence.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SEQUENCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sequence.h>
 #include __THRUST_HOST_SYSTEM_SEQUENCE_HEADER
 #undef __THRUST_HOST_SYSTEM_SEQUENCE_HEADER
diff --git a/thrust/system/detail/adl/set_operations.h b/thrust/system/detail/adl/set_operations.h
index 9917fbed6..7d09355e1 100644
--- a/thrust/system/detail/adl/set_operations.h
+++ b/thrust/system/detail/adl/set_operations.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/set_operations.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/set_operations.h>
+#include <thrust/system/cuda/detail/set_operations.h>
+#include <thrust/system/omp/detail/set_operations.h>
+#include <thrust/system/tbb/detail/set_operations.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/set_operations.h>
 #include __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER
 #undef __THRUST_HOST_SYSTEM_SET_OPERATIONS_HEADER
diff --git a/thrust/system/detail/adl/sort.h b/thrust/system/detail/adl/sort.h
index e45e162e6..1f6118c90 100644
--- a/thrust/system/detail/adl/sort.h
+++ b/thrust/system/detail/adl/sort.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/sort.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/sort.h>
+#include <thrust/system/cuda/detail/sort.h>
+#include <thrust/system/omp/detail/sort.h>
+#include <thrust/system/tbb/detail/sort.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SORT_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/sort.h>
 #include __THRUST_HOST_SYSTEM_SORT_HEADER
 #undef __THRUST_HOST_SYSTEM_SORT_HEADER
diff --git a/thrust/system/detail/adl/swap_ranges.h b/thrust/system/detail/adl/swap_ranges.h
index e053e3b8e..1ca3719d9 100644
--- a/thrust/system/detail/adl/swap_ranges.h
+++ b/thrust/system/detail/adl/swap_ranges.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/swap_ranges.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/swap_ranges.h>
+#include <thrust/system/cuda/detail/swap_ranges.h>
+#include <thrust/system/omp/detail/swap_ranges.h>
+#include <thrust/system/tbb/detail/swap_ranges.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/swap_ranges.h>
 #include __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER
 #undef __THRUST_HOST_SYSTEM_SWAP_RANGES_HEADER
diff --git a/thrust/system/detail/adl/tabulate.h b/thrust/system/detail/adl/tabulate.h
index 5f7b3de6e..6ae2b22a5 100644
--- a/thrust/system/detail/adl/tabulate.h
+++ b/thrust/system/detail/adl/tabulate.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/tabulate.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/tabulate.h>
+#include <thrust/system/cuda/detail/tabulate.h>
+#include <thrust/system/omp/detail/tabulate.h>
+#include <thrust/system/tbb/detail/tabulate.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_TABULATE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/tabulate.h>
 #include __THRUST_HOST_SYSTEM_TABULATE_HEADER
 #undef __THRUST_HOST_SYSTEM_TABULATE_HEADER
diff --git a/thrust/system/detail/adl/temporary_buffer.h b/thrust/system/detail/adl/temporary_buffer.h
index 60f2613c6..0cada5ee4 100644
--- a/thrust/system/detail/adl/temporary_buffer.h
+++ b/thrust/system/detail/adl/temporary_buffer.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/temporary_buffer.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/temporary_buffer.h>
+#include <thrust/system/cuda/detail/temporary_buffer.h>
+#include <thrust/system/omp/detail/temporary_buffer.h>
+#include <thrust/system/tbb/detail/temporary_buffer.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/temporary_buffer.h>
 #include __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER
 #undef __THRUST_HOST_SYSTEM_TEMPORARY_BUFFER_HEADER
diff --git a/thrust/system/detail/adl/transform.h b/thrust/system/detail/adl/transform.h
index a7edeb16e..b70333093 100644
--- a/thrust/system/detail/adl/transform.h
+++ b/thrust/system/detail/adl/transform.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/transform.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform.h>
+#include <thrust/system/cuda/detail/transform.h>
+#include <thrust/system/omp/detail/transform.h>
+#include <thrust/system/tbb/detail/transform.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_TRANSFORM_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform.h>
 #include __THRUST_HOST_SYSTEM_TRANSFORM_HEADER
 #undef __THRUST_HOST_SYSTEM_TRANSFORM_HEADER
diff --git a/thrust/system/detail/adl/transform_reduce.h b/thrust/system/detail/adl/transform_reduce.h
index d2eba6b4c..e3f9494df 100644
--- a/thrust/system/detail/adl/transform_reduce.h
+++ b/thrust/system/detail/adl/transform_reduce.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/transform_reduce.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform_reduce.h>
+#include <thrust/system/cuda/detail/transform_reduce.h>
+#include <thrust/system/omp/detail/transform_reduce.h>
+#include <thrust/system/tbb/detail/transform_reduce.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_reduce.h>
 #include __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER
 #undef __THRUST_HOST_SYSTEM_TRANSFORM_REDUCE_HEADER
diff --git a/thrust/system/detail/adl/transform_scan.h b/thrust/system/detail/adl/transform_scan.h
index 80d0ae2c7..3a05c7eee 100644
--- a/thrust/system/detail/adl/transform_scan.h
+++ b/thrust/system/detail/adl/transform_scan.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/transform_scan.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/transform_scan.h>
+#include <thrust/system/cuda/detail/transform_scan.h>
+#include <thrust/system/omp/detail/transform_scan.h>
+#include <thrust/system/tbb/detail/transform_scan.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/transform_scan.h>
 #include __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER
 #undef __THRUST_HOST_SYSTEM_TRANSFORM_SCAN_HEADER
diff --git a/thrust/system/detail/adl/uninitialized_copy.h b/thrust/system/detail/adl/uninitialized_copy.h
index db341ed3b..a13b18aa8 100644
--- a/thrust/system/detail/adl/uninitialized_copy.h
+++ b/thrust/system/detail/adl/uninitialized_copy.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/uninitialized_copy.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/uninitialized_copy.h>
+#include <thrust/system/cuda/detail/uninitialized_copy.h>
+#include <thrust/system/omp/detail/uninitialized_copy.h>
+#include <thrust/system/tbb/detail/uninitialized_copy.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_copy.h>
 #include __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER
 #undef __THRUST_HOST_SYSTEM_UNINITIALIZED_COPY_HEADER
diff --git a/thrust/system/detail/adl/uninitialized_fill.h b/thrust/system/detail/adl/uninitialized_fill.h
index 045b86f54..98b57836e 100644
--- a/thrust/system/detail/adl/uninitialized_fill.h
+++ b/thrust/system/detail/adl/uninitialized_fill.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/uninitialized_fill.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/uninitialized_fill.h>
+#include <thrust/system/cuda/detail/uninitialized_fill.h>
+#include <thrust/system/omp/detail/uninitialized_fill.h>
+#include <thrust/system/tbb/detail/uninitialized_fill.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/uninitialized_fill.h>
 #include __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER
 #undef __THRUST_HOST_SYSTEM_UNINITIALIZED_FILL_HEADER
diff --git a/thrust/system/detail/adl/unique.h b/thrust/system/detail/adl/unique.h
index 9f2b0692c..4082f5299 100644
--- a/thrust/system/detail/adl/unique.h
+++ b/thrust/system/detail/adl/unique.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/unique.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/unique.h>
+#include <thrust/system/cuda/detail/unique.h>
+#include <thrust/system/omp/detail/unique.h>
+#include <thrust/system/tbb/detail/unique.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_UNIQUE_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique.h>
 #include __THRUST_HOST_SYSTEM_UNIQUE_HEADER
 #undef __THRUST_HOST_SYSTEM_UNIQUE_HEADER
diff --git a/thrust/system/detail/adl/unique_by_key.h b/thrust/system/detail/adl/unique_by_key.h
index 685d8df62..dcf9acd42 100644
--- a/thrust/system/detail/adl/unique_by_key.h
+++ b/thrust/system/detail/adl/unique_by_key.h
@@ -24,6 +24,16 @@
 
 #include <thrust/system/detail/sequential/unique_by_key.h>
 
+// SCons can't see through the #defines below to figure out what this header
+// includes, so we fake it out by specifying all possible files we might end up
+// including inside an #if 0.
+#if 0
+#include <thrust/system/cpp/detail/unique_by_key.h>
+#include <thrust/system/cuda/detail/unique_by_key.h>
+#include <thrust/system/omp/detail/unique_by_key.h>
+#include <thrust/system/tbb/detail/unique_by_key.h>
+#endif
+
 #define __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER <__THRUST_HOST_SYSTEM_ROOT/detail/unique_by_key.h>
 #include __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER
 #undef __THRUST_HOST_SYSTEM_UNIQUE_BY_KEY_HEADER
diff --git a/thrust/system/detail/bad_alloc.h b/thrust/system/detail/bad_alloc.h
index 461704fd6..ae5dd5994 100644
--- a/thrust/system/detail/bad_alloc.h
+++ b/thrust/system/detail/bad_alloc.h
@@ -20,8 +20,9 @@
 #include <new>
 #include <string>
 
-namespace thrust
-{
+#include <thrust/detail/config.h>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,9 +41,9 @@ class bad_alloc
       m_what += w;
     } // end bad_alloc()
 
-    inline virtual ~bad_alloc(void) throw () {};
+    inline virtual ~bad_alloc(void) noexcept {};
 
-    inline virtual const char *what(void) const throw()
+    inline virtual const char *what(void) const noexcept
     {
       return m_what.c_str();
     } // end what()
@@ -53,5 +54,5 @@ class bad_alloc
   
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/errno.h b/thrust/system/detail/errno.h
index 78aec2ace..69cb2bd98 100644
--- a/thrust/system/detail/errno.h
+++ b/thrust/system/detail/errno.h
@@ -24,8 +24,7 @@
 // pollute the global namespace. These identifiers are in lowercase to avoid
 // colliding with the real macros in errno.h.
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -116,5 +115,5 @@ static const int emlink          = 9979;
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_category.inl b/thrust/system/detail/error_category.inl
index 949e7c5d5..45fd15a3f 100644
--- a/thrust/system/detail/error_category.inl
+++ b/thrust/system/detail/error_category.inl
@@ -17,13 +17,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/error_code.h>
 #include <thrust/system/detail/errno.h>
 #include <thrust/functional.h>
 #include <cstring>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -99,7 +100,9 @@ class generic_error_category
 
       // XXX strerror is not thread-safe:
       //     prefer strerror_r (which is not provided on windows)
+      THRUST_DISABLE_MSVC_WARNING_BEGIN(4996)
       const char *c_str = std::strerror(ev);
+      THRUST_DISABLE_MSVC_WARNING_END(4996)
       return c_str ? std::string(c_str) : unknown_err;
     }
 }; // end generic_category_result
@@ -230,5 +233,5 @@ const error_category &system_category(void)
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_code.inl b/thrust/system/detail/error_code.inl
index 6631f486f..2b819c048 100644
--- a/thrust/system/detail/error_code.inl
+++ b/thrust/system/detail/error_code.inl
@@ -17,10 +17,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/error_code.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -193,5 +194,5 @@ bool operator!=(const error_condition &lhs, const error_condition &rhs)
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/error_condition.inl b/thrust/system/detail/error_condition.inl
index 9dc493bcc..0daf1f293 100644
--- a/thrust/system/detail/error_condition.inl
+++ b/thrust/system/detail/error_condition.inl
@@ -17,11 +17,12 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/detail/error_condition.inl>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -129,5 +130,5 @@ bool operator<(const error_condition &lhs,
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/adjacent_difference.h b/thrust/system/detail/generic/adjacent_difference.h
index 6e4caaa88..43592e15b 100644
--- a/thrust/system/detail/generic/adjacent_difference.h
+++ b/thrust/system/detail/generic/adjacent_difference.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,7 +51,7 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/adjacent_difference.inl>
 
diff --git a/thrust/system/detail/generic/adjacent_difference.inl b/thrust/system/detail/generic/adjacent_difference.inl
index ad4ad1cd4..504129328 100644
--- a/thrust/system/detail/generic/adjacent_difference.inl
+++ b/thrust/system/detail/generic/adjacent_difference.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 #include <thrust/adjacent_difference.h>
@@ -22,8 +24,7 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/transform.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,17 +58,17 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
   if(first == last)
   {
     // empty range, nothing to do
-    return result; 
+    return result;
   }
-  else 
+  else
   {
     // an in-place operation is requested, copy the input and call the entry point
     // XXX a special-purpose kernel would be faster here since
     // only block boundaries need to be copied
     thrust::detail::temporary_array<InputType, DerivedPolicy> input_copy(exec, first, last);
-    
+
     *result = *first;
-    thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op); 
+    thrust::transform(exec, input_copy.begin() + 1, input_copy.end(), input_copy.begin(), result + 1, binary_op);
   }
 
   return result + (last - first);
@@ -77,5 +78,5 @@ OutputIterator adjacent_difference(thrust::execution_policy<DerivedPolicy> &exec
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/advance.h b/thrust/system/detail/generic/advance.h
index f9cab587b..4d6562e00 100644
--- a/thrust/system/detail/generic/advance.h
+++ b/thrust/system/detail/generic/advance.h
@@ -19,8 +19,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,7 +34,7 @@ void advance(InputIterator& i, Distance n);
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/advance.inl>
 
diff --git a/thrust/system/detail/generic/advance.inl b/thrust/system/detail/generic/advance.inl
index ae98d596b..21555ebb0 100644
--- a/thrust/system/detail/generic/advance.inl
+++ b/thrust/system/detail/generic/advance.inl
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/advance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -63,5 +64,5 @@ void advance(InputIterator& i, Distance n)
 } // end namespace detail
 } // end namespace generic
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/binary_search.h b/thrust/system/detail/generic/binary_search.h
index 8cd85c63f..6603f6c30 100644
--- a/thrust/system/detail/generic/binary_search.h
+++ b/thrust/system/detail/generic/binary_search.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -168,7 +167,7 @@ equal_range(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/binary_search.inl>
 
diff --git a/thrust/system/detail/generic/binary_search.inl b/thrust/system/detail/generic/binary_search.inl
index 143d8659f..bc60bb8e5 100644
--- a/thrust/system/detail/generic/binary_search.inl
+++ b/thrust/system/detail/generic/binary_search.inl
@@ -14,11 +14,6 @@
  *  limitations under the License.
  */
 
-
-/*! \file binary_search.inl
- *  \brief Inline file for binary_search.h
- */
-
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -31,12 +26,12 @@
 #include <thrust/for_each.h>
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/generic/scalar/binary_search.h>
+#include <thrust/system/detail/generic/select_system.h>
 
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -88,9 +83,9 @@ struct bsf
   bool operator()(RandomAccessIterator begin, RandomAccessIterator end, const T& value, StrictWeakOrdering comp)
   {
     RandomAccessIterator iter = thrust::system::detail::generic::scalar::lower_bound(begin, end, value, comp);
-    
+
     thrust::detail::wrapped_function<StrictWeakOrdering,bool> wrapped_comp(comp);
-    
+
     return iter != end && !wrapped_comp(value, *iter);
   }
 };
@@ -103,11 +98,11 @@ struct binary_search_functor
   ForwardIterator end;
   StrictWeakOrdering comp;
   BinarySearchFunction func;
-  
+
   __host__ __device__
   binary_search_functor(ForwardIterator begin, ForwardIterator end, StrictWeakOrdering comp, BinarySearchFunction func)
     : begin(begin), end(end), comp(comp), func(func) {}
-  
+
   template<typename Tuple>
   __host__ __device__
   void operator()(Tuple t)
@@ -121,9 +116,9 @@ struct binary_search_functor
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering, typename BinarySearchFunction>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output,
                              StrictWeakOrdering comp,
@@ -133,11 +128,11 @@ OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                    thrust::make_zip_iterator(thrust::make_tuple(values_begin, output)),
                    thrust::make_zip_iterator(thrust::make_tuple(values_end, output + thrust::distance(values_begin, values_end))),
                    detail::binary_search_functor<ForwardIterator, StrictWeakOrdering, BinarySearchFunction>(begin, end, comp, func));
-  
+
   return output + thrust::distance(values_begin, values_end);
 }
 
-   
+
 
 // Scalar Implementation
 template<typename OutputType, typename DerivedPolicy, typename ForwardIterator, typename T, typename StrictWeakOrdering, typename BinarySearchFunction>
@@ -145,24 +140,39 @@ __host__ __device__
 OutputType binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                          ForwardIterator begin,
                          ForwardIterator end,
-                         const T& value, 
+                         const T& value,
                          StrictWeakOrdering comp,
                          BinarySearchFunction func)
 {
   // use the vectorized path to implement the scalar version
-  
+
   // allocate device buffers for value and output
   thrust::detail::temporary_array<T,DerivedPolicy>          d_value(exec,1);
   thrust::detail::temporary_array<OutputType,DerivedPolicy> d_output(exec,1);
-  
-  // copy value to device
-  d_value[0] = value;
-  
+
+  { // copy value to device
+    typedef typename thrust::iterator_system<const T*>::type value_in_system_t;
+    value_in_system_t value_in_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(value_in_system)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(exec))),
+                   &value, 1, d_value.begin());
+  }
+
   // perform the query
   thrust::system::detail::generic::detail::binary_search(exec, begin, end, d_value.begin(), d_value.end(), d_output.begin(), comp, func);
-  
-  // copy result to host and return
-  return d_output[0];
+
+  OutputType output;
+  { // copy result to host and return
+    typedef typename thrust::iterator_system<OutputType*>::type result_out_system_t;
+    result_out_system_t result_out_system;
+    using thrust::system::detail::generic::select_system;
+    thrust::copy_n(select_system(thrust::detail::derived_cast(thrust::detail::strip_const(exec)),
+                                 thrust::detail::derived_cast(thrust::detail::strip_const(result_out_system))),
+                   d_output.begin(), 1, &output);
+  }
+
+  return output;
 }
 
 
@@ -180,7 +190,7 @@ struct binary_search_less
   }
 };
 
-   
+
 } // end namespace detail
 
 
@@ -205,11 +215,11 @@ __host__ __device__
 ForwardIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
                             ForwardIterator begin,
                             ForwardIterator end,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
-  
+
   return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::lbf());
 }
 
@@ -231,11 +241,11 @@ __host__ __device__
 ForwardIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
                             ForwardIterator begin,
                             ForwardIterator end,
-                            const T& value, 
+                            const T& value,
                             StrictWeakOrdering comp)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::difference_type difference_type;
-  
+
   return begin + detail::binary_search<difference_type>(exec, begin, end, value, comp, detail::ubf());
 }
 
@@ -256,7 +266,7 @@ __host__ __device__
 bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
                    ForwardIterator begin,
                    ForwardIterator end,
-                   const T& value, 
+                   const T& value,
                    StrictWeakOrdering comp)
 {
   return detail::binary_search<bool>(exec, begin, end, value, comp, detail::bsf());
@@ -271,9 +281,9 @@ bool binary_search(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output)
 {
@@ -285,9 +295,9 @@ OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -299,9 +309,9 @@ OutputIterator lower_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output)
 {
@@ -313,9 +323,9 @@ OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
-                           ForwardIterator begin, 
+                           ForwardIterator begin,
                            ForwardIterator end,
-                           InputIterator values_begin, 
+                           InputIterator values_begin,
                            InputIterator values_end,
                            OutputIterator output,
                            StrictWeakOrdering comp)
@@ -327,9 +337,9 @@ OutputIterator upper_bound(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output)
 {
@@ -341,9 +351,9 @@ OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
 template<typename DerivedPolicy, typename ForwardIterator, typename InputIterator, typename OutputIterator, typename StrictWeakOrdering>
 __host__ __device__
 OutputIterator binary_search(thrust::execution_policy<DerivedPolicy> &exec,
-                             ForwardIterator begin, 
+                             ForwardIterator begin,
                              ForwardIterator end,
-                             InputIterator values_begin, 
+                             InputIterator values_begin,
                              InputIterator values_end,
                              OutputIterator output,
                              StrictWeakOrdering comp)
@@ -382,5 +392,5 @@ equal_range(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/copy.h b/thrust/system/detail/generic/copy.h
index e22535618..36ac71899 100644
--- a/thrust/system/detail/generic/copy.h
+++ b/thrust/system/detail/generic/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/copy.inl>
 
diff --git a/thrust/system/detail/generic/copy.inl b/thrust/system/detail/generic/copy.inl
index 9763a0682..34d66baa6 100644
--- a/thrust/system/detail/generic/copy.inl
+++ b/thrust/system/detail/generic/copy.inl
@@ -26,8 +26,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/iterator/detail/minimum_system.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -77,5 +76,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/copy_if.h b/thrust/system/detail/generic/copy_if.h
index 6e3fb73a6..6a13edfda 100644
--- a/thrust/system/detail/generic/copy_if.h
+++ b/thrust/system/detail/generic/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/copy_if.inl>
 
diff --git a/thrust/system/detail/generic/copy_if.inl b/thrust/system/detail/generic/copy_if.inl
index f2968a561..5a6edd72e 100644
--- a/thrust/system/detail/generic/copy_if.inl
+++ b/thrust/system/detail/generic/copy_if.inl
@@ -32,8 +32,7 @@
 #include <thrust/scatter.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ OutputIterator copy_if(thrust::execution_policy<DerivedPolicy> &exec,
                        OutputIterator result,
                        Predicate pred)
 {
-  __THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(IndexType n = thrust::distance(first, last));
+  THRUST_DISABLE_MSVC_POSSIBLE_LOSS_OF_DATA_WARNING(IndexType n = thrust::distance(first, last));
   
   // compute {0,1} predicates
   thrust::detail::temporary_array<IndexType, DerivedPolicy> predicates(exec, n);
@@ -157,5 +156,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/count.h b/thrust/system/detail/generic/count.h
index 218369e38..295d36e6b 100644
--- a/thrust/system/detail/generic/count.h
+++ b/thrust/system/detail/generic/count.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,7 +44,7 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/count.inl>
 
diff --git a/thrust/system/detail/generic/count.inl b/thrust/system/detail/generic/count.inl
index d9e1039e8..dafc1c1df 100644
--- a/thrust/system/detail/generic/count.inl
+++ b/thrust/system/detail/generic/count.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/count.h>
 #include <thrust/transform_reduce.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -32,7 +33,7 @@ namespace generic
 template <typename InputType, typename Predicate, typename CountType>
 struct count_if_transform
 {
-  __host__ __device__ 
+  __host__ __device__
   count_if_transform(Predicate _pred) : pred(_pred){}
 
   __thrust_exec_check_disable__
@@ -54,8 +55,9 @@ __host__ __device__
 typename thrust::iterator_traits<InputIterator>::difference_type
 count(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, InputIterator last, const EqualityComparable& value)
 {
-  // XXX use placeholder expression here
-  return thrust::count_if(exec, first, last, thrust::detail::equal_to_value<EqualityComparable>(value));
+  using thrust::placeholders::_1;
+
+  return thrust::count_if(exec, first, last, _1 == value);
 } // end count()
 
 
@@ -66,7 +68,7 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 {
   typedef typename thrust::iterator_traits<InputIterator>::value_type InputType;
   typedef typename thrust::iterator_traits<InputIterator>::difference_type CountType;
-  
+
   thrust::system::detail::generic::count_if_transform<InputType, Predicate, CountType> unary_op(pred);
   thrust::plus<CountType> binary_op;
   return thrust::transform_reduce(exec, first, last, unary_op, CountType(0), binary_op);
@@ -76,5 +78,5 @@ count_if(thrust::execution_policy<DerivedPolicy> &exec, InputIterator first, Inp
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/distance.h b/thrust/system/detail/generic/distance.h
index 03b0fb556..4627376b5 100644
--- a/thrust/system/detail/generic/distance.h
+++ b/thrust/system/detail/generic/distance.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -37,7 +36,7 @@ inline __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/distance.inl>
 
diff --git a/thrust/system/detail/generic/distance.inl b/thrust/system/detail/generic/distance.inl
index 5cc697200..46bad7ba7 100644
--- a/thrust/system/detail/generic/distance.inl
+++ b/thrust/system/detail/generic/distance.inl
@@ -14,12 +14,13 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -60,7 +61,7 @@ inline __host__ __device__
 
 } // end detail
 
-
+__thrust_exec_check_disable__
 template<typename InputIterator>
 inline __host__ __device__
   typename thrust::iterator_traits<InputIterator>::difference_type
@@ -75,5 +76,5 @@ inline __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/equal.h b/thrust/system/detail/generic/equal.h
index 8962b1bd1..4afd88d00 100644
--- a/thrust/system/detail/generic/equal.h
+++ b/thrust/system/detail/generic/equal.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,7 +41,7 @@ bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/equal.inl>
 
diff --git a/thrust/system/detail/generic/equal.inl b/thrust/system/detail/generic/equal.inl
index 7c9dec4bc..c023070cd 100644
--- a/thrust/system/detail/generic/equal.inl
+++ b/thrust/system/detail/generic/equal.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/equal.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/mismatch.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,7 +36,7 @@ __host__ __device__
 bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2)
 {
   typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  
+
   return thrust::equal(exec, first1, last1, first2, thrust::detail::equal_to<InputType1>());
 }
 
@@ -54,5 +55,5 @@ bool equal(thrust::execution_policy<DerivedPolicy> &exec, InputIterator1 first1,
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/extrema.h b/thrust/system/detail/generic/extrema.h
index a3ee81889..e3b447958 100644
--- a/thrust/system/detail/generic/extrema.h
+++ b/thrust/system/detail/generic/extrema.h
@@ -25,8 +25,7 @@
 #include <thrust/pair.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -83,7 +82,7 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_p
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/extrema.inl>
 
diff --git a/thrust/system/detail/generic/extrema.inl b/thrust/system/detail/generic/extrema.inl
index d80773ef7..744d137de 100644
--- a/thrust/system/detail/generic/extrema.inl
+++ b/thrust/system/detail/generic/extrema.inl
@@ -22,6 +22,7 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/get_iterator_value.h>
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
 #include <thrust/pair.h>
@@ -32,8 +33,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -172,7 +172,7 @@ ForwardIterator min_element(thrust::execution_policy<DerivedPolicy> &exec,
       (exec,
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
-       thrust::tuple<InputType, IndexType>(get_value(derived_cast(exec), &first[0]), 0),
+       thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec), first), 0),
        detail::min_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
 
   return first + thrust::get<1>(result);
@@ -209,7 +209,7 @@ ForwardIterator max_element(thrust::execution_policy<DerivedPolicy> &exec,
       (exec,
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
-       thrust::tuple<InputType, IndexType>(get_value(derived_cast(exec),&first[0]), 0),
+       thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec),first), 0),
        detail::max_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
 
   return first + thrust::get<1>(result);
@@ -247,7 +247,8 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_p
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))),
        thrust::make_zip_iterator(thrust::make_tuple(first, thrust::counting_iterator<IndexType>(0))) + (last - first),
        detail::duplicate_tuple<InputType, IndexType>(),
-       detail::duplicate_tuple<InputType, IndexType>()(thrust::tuple<InputType, IndexType>(get_value(derived_cast(exec),&first[0]), 0)),
+       detail::duplicate_tuple<InputType, IndexType>()(
+         thrust::tuple<InputType, IndexType>(thrust::detail::get_iterator_value(derived_cast(exec),first), 0)),
        detail::minmax_element_reduction<InputType, IndexType, BinaryPredicate>(comp));
 
   return thrust::make_pair(first + thrust::get<1>(thrust::get<0>(result)), first + thrust::get<1>(thrust::get<1>(result)));
@@ -257,5 +258,5 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(thrust::execution_p
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/fill.h b/thrust/system/detail/generic/fill.h
index 6c4f2ed4e..5a881359b 100644
--- a/thrust/system/detail/generic/fill.h
+++ b/thrust/system/detail/generic/fill.h
@@ -16,12 +16,13 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/detail/internal_functional.h>
 #include <thrust/generate.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -56,5 +57,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/find.h b/thrust/system/detail/generic/find.h
index 00e11e53c..6db441d02 100644
--- a/thrust/system/detail/generic/find.h
+++ b/thrust/system/detail/generic/find.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,7 +56,7 @@ InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/find.inl>
 
diff --git a/thrust/system/detail/generic/find.inl b/thrust/system/detail/generic/find.inl
index 9414fc615..8bd619561 100644
--- a/thrust/system/detail/generic/find.inl
+++ b/thrust/system/detail/generic/find.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/find.h>
 #include <thrust/reduce.h>
@@ -28,8 +30,7 @@
 
 // Contributed by Erich Elsen
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,8 +46,9 @@ InputIterator find(thrust::execution_policy<DerivedPolicy> &exec,
                    InputIterator last,
                    const T& value)
 {
-  // XXX consider a placeholder expression here
-  return thrust::find_if(exec, first, last, thrust::detail::equal_to_value<T>(value));
+  using thrust::placeholders::_1;
+
+  return thrust::find_if(exec, first, last, _1 == value);
 } // end find()
 
 
@@ -71,7 +73,7 @@ struct find_if_functor
     }
   }
 };
-    
+
 
 template<typename DerivedPolicy, typename InputIterator, typename Predicate>
 __host__ __device__
@@ -82,30 +84,30 @@ InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
 {
   typedef typename thrust::iterator_traits<InputIterator>::difference_type difference_type;
   typedef typename thrust::tuple<bool,difference_type> result_type;
-  
+
   // empty sequence
   if(first == last) return last;
-  
+
   const difference_type n = thrust::distance(first, last);
-  
+
   // this implementation breaks up the sequence into separate intervals
   // in an attempt to early-out as soon as a value is found
-  
+
   // TODO incorporate sizeof(InputType) into interval_threshold and round to multiple of 32
   const difference_type interval_threshold = 1 << 20;
   const difference_type interval_size = (thrust::min)(interval_threshold, n);
-  
+
   // force transform_iterator output to bool
   typedef thrust::transform_iterator<Predicate, InputIterator, bool> XfrmIterator;
   typedef thrust::tuple<XfrmIterator, thrust::counting_iterator<difference_type> > IteratorTuple;
   typedef thrust::zip_iterator<IteratorTuple> ZipIterator;
-  
+
   IteratorTuple iter_tuple = thrust::make_tuple(XfrmIterator(first, pred),
                                                 thrust::counting_iterator<difference_type>(0));
-  
+
   ZipIterator begin = thrust::make_zip_iterator(iter_tuple);
   ZipIterator end   = begin + n;
-  
+
   for(ZipIterator interval_begin = begin; interval_begin < end; interval_begin += interval_size)
   {
     ZipIterator interval_end = interval_begin + interval_size;
@@ -113,19 +115,19 @@ InputIterator find_if(thrust::execution_policy<DerivedPolicy> &exec,
     {
       interval_end = end;
     } // end if
-    
+
     result_type result = thrust::reduce(exec,
                                         interval_begin, interval_end,
                                         result_type(false,interval_end - begin),
                                         find_if_functor<result_type>());
-    
+
     // see if we found something
     if(thrust::get<0>(result))
     {
       return first + thrust::get<1>(result);
     }
   }
-  
+
   //nothing was found if we reach here...
   return first + n;
 }
@@ -145,5 +147,5 @@ InputIterator find_if_not(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/for_each.h b/thrust/system/detail/generic/for_each.h
index a8c79b76d..3c2ec12cd 100644
--- a/thrust/system/detail/generic/for_each.h
+++ b/thrust/system/detail/generic/for_each.h
@@ -26,8 +26,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/detail/static_assert.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,13 +39,15 @@ template<typename DerivedPolicy,
          typename InputIterator,
          typename UnaryFunction>
 __host__ __device__
-InputIterator for_each(thrust::execution_policy<DerivedPolicy> &exec,
+InputIterator for_each(thrust::execution_policy<DerivedPolicy> &,
                        InputIterator first,
-                       InputIterator last,
-                       UnaryFunction f)
+                       InputIterator ,
+                       UnaryFunction )
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
   return first;
 } // end for_each()
 
@@ -56,13 +57,15 @@ template<typename DerivedPolicy,
          typename Size,
          typename UnaryFunction>
 __host__ __device__
-InputIterator for_each_n(thrust::execution_policy<DerivedPolicy> &exec,
+InputIterator for_each_n(thrust::execution_policy<DerivedPolicy> &,
                          InputIterator first,
-                         Size n,
-                         UnaryFunction f)
+                         Size ,
+                         UnaryFunction )
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
   return first;
 } // end for_each_n()
 
@@ -70,5 +73,5 @@ InputIterator for_each_n(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/gather.h b/thrust/system/detail/generic/gather.h
index d587572f0..5b6b41831 100644
--- a/thrust/system/detail/generic/gather.h
+++ b/thrust/system/detail/generic/gather.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -75,7 +74,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/gather.inl>
 
diff --git a/thrust/system/detail/generic/gather.inl b/thrust/system/detail/generic/gather.inl
index 4f4289ecb..7ab550edf 100644
--- a/thrust/system/detail/generic/gather.inl
+++ b/thrust/system/detail/generic/gather.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/gather.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -103,5 +104,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/generate.h b/thrust/system/detail/generic/generate.h
index edc2cc5eb..a9846c5be 100644
--- a/thrust/system/detail/generic/generate.h
+++ b/thrust/system/detail/generic/generate.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/generate.inl>
 
diff --git a/thrust/system/detail/generic/generate.inl b/thrust/system/detail/generic/generate.inl
index 1cd335853..869e0f32b 100644
--- a/thrust/system/detail/generic/generate.inl
+++ b/thrust/system/detail/generic/generate.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/generate.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -38,6 +39,25 @@ __host__ __device__
                 ForwardIterator last,
                 Generator gen)
 {
+  // this static assert is necessary due to a workaround in generate_functor
+  // it takes a const reference to accept temporaries from proxy iterators
+  // and then const_casts the constness away
+  //
+  // this had the weird side effect of allowing generate (and fill, and whatever
+  // else is implemented in terms of generate) to fill through const iterators.
+  // this might become unnecessary once Thrust is C++11-and-above only, since the
+  // other solution is to take an rvalue reference in a second overload of
+  // operator() of the function object, but until we support pre-11, this is a
+  // nice solution that validates the const_cast and doesn't take away any
+  // functionality.
+  THRUST_STATIC_ASSERT_MSG(
+    !thrust::detail::is_const<
+      typename thrust::detail::remove_reference<
+        typename thrust::iterator_traits<ForwardIterator>::reference
+      >::type
+    >::value
+  , "generating to `const` iterators is not allowed"
+  );
   thrust::for_each(exec, first, last, typename thrust::detail::generate_functor<ExecutionPolicy,Generator>::type(gen));
 } // end generate()
 
@@ -51,11 +71,30 @@ __host__ __device__
                             Size n,
                             Generator gen)
 {
+  // this static assert is necessary due to a workaround in generate_functor
+  // it takes a const reference to accept temporaries from proxy iterators
+  // and then const_casts the constness away
+  //
+  // this had the weird side effect of allowing generate (and fill, and whatever
+  // else is implemented in terms of generate) to fill through const iterators.
+  // this might become unnecessary once Thrust is C++11-and-above only, since the
+  // other solution is to take an rvalue reference in a second overload of
+  // operator() of the function object, but until we support pre-11, this is a
+  // nice solution that validates the const_cast and doesn't take away any
+  // functionality.
+  THRUST_STATIC_ASSERT_MSG(
+    !thrust::detail::is_const<
+      typename thrust::detail::remove_reference<
+        typename thrust::iterator_traits<OutputIterator>::reference
+      >::type
+    >::value
+  , "generating to `const` iterators is not allowed"
+  );
   return thrust::for_each_n(exec, first, n, typename thrust::detail::generate_functor<ExecutionPolicy,Generator>::type(gen));
 } // end generate()
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/inner_product.h b/thrust/system/detail/generic/inner_product.h
index 71e1a9270..62d10d31f 100644
--- a/thrust/system/detail/generic/inner_product.h
+++ b/thrust/system/detail/generic/inner_product.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/inner_product.inl>
 
diff --git a/thrust/system/detail/generic/inner_product.inl b/thrust/system/detail/generic/inner_product.inl
index 0a50386be..5055ec10f 100644
--- a/thrust/system/detail/generic/inner_product.inl
+++ b/thrust/system/detail/generic/inner_product.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/inner_product.h>
 #include <thrust/functional.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/transform_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +51,7 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
                          InputIterator1 first1,
                          InputIterator1 last1,
                          InputIterator2 first2,
-                         OutputType init, 
+                         OutputType init,
                          BinaryFunction1 binary_op1,
                          BinaryFunction2 binary_op2)
 {
@@ -68,5 +69,5 @@ OutputType inner_product(thrust::execution_policy<DerivedPolicy> &exec,
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/logical.h b/thrust/system/detail/generic/logical.h
index 702dbad85..e261154e2 100644
--- a/thrust/system/detail/generic/logical.h
+++ b/thrust/system/detail/generic/logical.h
@@ -22,8 +22,7 @@
 #include <thrust/find.h>
 #include <thrust/logical.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -59,5 +58,5 @@ bool none_of(thrust::execution_policy<ExecutionPolicy> &exec, InputIterator firs
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/memory.h b/thrust/system/detail/generic/memory.h
index acef823d8..675cc7302 100644
--- a/thrust/system/detail/generic/memory.h
+++ b/thrust/system/detail/generic/memory.h
@@ -28,10 +28,8 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/pair.h>
-#include <thrust/system/detail/generic/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -59,14 +57,14 @@ template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
 void get_value(thrust::execution_policy<DerivedPolicy> &, Pointer);
 
-template<typename Pointer1, typename Pointer2>
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 __host__ __device__
-void iter_swap(tag, Pointer1, Pointer2);
+void iter_swap(thrust::execution_policy<DerivedPolicy>&, Pointer1, Pointer2);
 
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/memory.inl>
 
diff --git a/thrust/system/detail/generic/memory.inl b/thrust/system/detail/generic/memory.inl
index 69645d0f3..b85729098 100644
--- a/thrust/system/detail/generic/memory.inl
+++ b/thrust/system/detail/generic/memory.inl
@@ -14,16 +14,16 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
 #include <thrust/system/detail/generic/memory.h>
-#include <thrust/system/detail/generic/select_system.h>
 #include <thrust/system/detail/adl/malloc_and_free.h>
 #include <thrust/detail/static_assert.h>
 #include <thrust/detail/malloc_and_free.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -36,8 +36,10 @@ template<typename DerivedPolicy, typename Size>
 __host__ __device__
   void malloc(thrust::execution_policy<DerivedPolicy> &, Size)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<Size, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Size, false>::value)
+  , "unimplemented for this system"
+  );
 }
 
 
@@ -56,8 +58,10 @@ template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
   void free(thrust::execution_policy<DerivedPolicy> &, Pointer)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<Pointer, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer, false>::value)
+  , "unimplemented for this system"
+  );
 }
 
 
@@ -65,8 +69,10 @@ template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 __host__ __device__
 void assign_value(thrust::execution_policy<DerivedPolicy> &, Pointer1, Pointer2)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<Pointer1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer1, false>::value)
+  , "unimplemented for this system"
+  );
 }
 
 
@@ -74,22 +80,26 @@ template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
 void get_value(thrust::execution_policy<DerivedPolicy> &, Pointer)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<Pointer, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer, false>::value)
+  , "unimplemented for this system"
+  );
 }
 
 
-template<typename Pointer1, typename Pointer2>
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 __host__ __device__
-void iter_swap(tag, Pointer1, Pointer2)
+void iter_swap(thrust::execution_policy<DerivedPolicy> &, Pointer1, Pointer2)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<Pointer1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<Pointer1, false>::value)
+  , "unimplemented for this system"
+  );
 }
 
 
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/merge.h b/thrust/system/detail/generic/merge.h
index d80906e3d..6e8246407 100644
--- a/thrust/system/detail/generic/merge.h
+++ b/thrust/system/detail/generic/merge.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -85,7 +84,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/merge.inl>
 
diff --git a/thrust/system/detail/generic/merge.inl b/thrust/system/detail/generic/merge.inl
index 8f6005aff..03b77e623 100644
--- a/thrust/system/detail/generic/merge.inl
+++ b/thrust/system/detail/generic/merge.inl
@@ -25,8 +25,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -41,16 +40,18 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename StrictWeakOrdering>
 __host__ __device__
-  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &exec,
-                       InputIterator1 first1,
-                       InputIterator1 last1,
-                       InputIterator2 first2,
-                       InputIterator2 last2,
+  OutputIterator merge(thrust::execution_policy<DerivedPolicy> &,
+                       InputIterator1,
+                       InputIterator1,
+                       InputIterator2,
+                       InputIterator2,
                        OutputIterator result,
-                       StrictWeakOrdering comp)
+                       StrictWeakOrdering)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end merge()
 
@@ -125,5 +126,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/mismatch.h b/thrust/system/detail/generic/mismatch.h
index 50e9f678b..4a71cd344 100644
--- a/thrust/system/detail/generic/mismatch.h
+++ b/thrust/system/detail/generic/mismatch.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,7 +51,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/mismatch.inl>
 
diff --git a/thrust/system/detail/generic/mismatch.inl b/thrust/system/detail/generic/mismatch.inl
index d879a6e11..f6b9674a1 100644
--- a/thrust/system/detail/generic/mismatch.inl
+++ b/thrust/system/detail/generic/mismatch.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/mismatch.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/find.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -38,10 +39,9 @@ __host__ __device__
              InputIterator1 last1,
              InputIterator2 first2)
 {
-  typedef typename thrust::iterator_value<InputIterator1>::type InputType1;
-  
-  // XXX use a placeholder expression here
-  return thrust::mismatch(exec, first1, last1, first2, thrust::detail::equal_to<InputType1>());
+  using namespace thrust::placeholders;
+
+  return thrust::mismatch(exec, first1, last1, first2, _1 == _2);
 } // end mismatch()
 
 
@@ -57,12 +57,12 @@ __host__ __device__
   // Contributed by Erich Elsen
   typedef thrust::tuple<InputIterator1,InputIterator2> IteratorTuple;
   typedef thrust::zip_iterator<IteratorTuple>          ZipIterator;
-  
+
   ZipIterator zipped_first = thrust::make_zip_iterator(thrust::make_tuple(first1,first2));
   ZipIterator zipped_last  = thrust::make_zip_iterator(thrust::make_tuple(last1, first2));
-  
+
   ZipIterator result = thrust::find_if_not(exec, zipped_first, zipped_last, thrust::detail::tuple_binary_predicate<BinaryPredicate>(pred));
-  
+
   return thrust::make_pair(thrust::get<0>(result.get_iterator_tuple()),
                            thrust::get<1>(result.get_iterator_tuple()));
 } // end mismatch()
@@ -71,5 +71,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/partition.h b/thrust/system/detail/generic/partition.h
index fdd158c4c..113d6ecbc 100644
--- a/thrust/system/detail/generic/partition.h
+++ b/thrust/system/detail/generic/partition.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -164,7 +163,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/partition.inl>
 
diff --git a/thrust/system/detail/generic/partition.inl b/thrust/system/detail/generic/partition.inl
index 73a8a286e..ab56fdd57 100644
--- a/thrust/system/detail/generic/partition.inl
+++ b/thrust/system/detail/generic/partition.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/partition.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -29,8 +31,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -244,5 +245,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/cuda/detail/synchronize.h b/thrust/system/detail/generic/per_device_resource.h
similarity index 60%
rename from thrust/system/cuda/detail/synchronize.h
rename to thrust/system/detail/generic/per_device_resource.h
index c57bac2ac..606f91f36 100644
--- a/thrust/system/cuda/detail/synchronize.h
+++ b/thrust/system/detail/generic/per_device_resource.h
@@ -14,37 +14,33 @@
  *  limitations under the License.
  */
 
-
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/system/detail/generic/tag.h>
+#include <thrust/mr/memory_resource.h>
+#include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
-namespace cuda
-{
 namespace detail
 {
+namespace generic
+{
 
 
-inline __host__ __device__
-void synchronize(const char *message = "");
-
-inline __host__ __device__
-void synchronize(cudaStream_t stream, const char *message = "");
-
-
-inline __host__ __device__
-void synchronize_if_enabled(const char *message = "");
-
-
+template<typename MR, typename DerivedPolicy>
+__host__
+MR * get_per_device_resource(thrust::detail::execution_policy_base<DerivedPolicy>&)
+{
+    return mr::get_global_resource<MR>();
+}
 
-} // end namespace detail
-} // end namespace cuda
-} // end namespace system
-} // end namespace thrust
 
-#include <thrust/system/cuda/detail/synchronize.inl>
+} // end generic
+} // end detail
+} // end system
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reduce.h b/thrust/system/detail/generic/reduce.h
index c3e7af0d2..f28b11a87 100644
--- a/thrust/system/detail/generic/reduce.h
+++ b/thrust/system/detail/generic/reduce.h
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -53,7 +52,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reduce.inl>
 
diff --git a/thrust/system/detail/generic/reduce.inl b/thrust/system/detail/generic/reduce.inl
index d7ce56380..d673d0cf8 100644
--- a/thrust/system/detail/generic/reduce.inl
+++ b/thrust/system/detail/generic/reduce.inl
@@ -16,14 +16,15 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/reduce.h>
 #include <thrust/system/detail/generic/reduce.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/functional.h>
 #include <thrust/detail/static_assert.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,14 +59,16 @@ template<typename ExecutionPolicy,
          typename OutputType,
          typename BinaryFunction>
 __host__ __device__
-  OutputType reduce(thrust::execution_policy<ExecutionPolicy> &exec,
-                    RandomAccessIterator first,
-                    RandomAccessIterator last,
-                    OutputType init,
-                    BinaryFunction binary_op)
+  OutputType reduce(thrust::execution_policy<ExecutionPolicy> &,
+                    RandomAccessIterator,
+                    RandomAccessIterator,
+                    OutputType,
+                    BinaryFunction)
 {
-  // unimplemented
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value)
+  , "unimplemented for this system"
+  );
   return OutputType();
 } // end reduce()
 
@@ -73,5 +76,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reduce_by_key.h b/thrust/system/detail/generic/reduce_by_key.h
index aaa5959a4..8ba47e11f 100644
--- a/thrust/system/detail/generic/reduce_by_key.h
+++ b/thrust/system/detail/generic/reduce_by_key.h
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -83,7 +82,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reduce_by_key.inl>
 
diff --git a/thrust/system/detail/generic/reduce_by_key.inl b/thrust/system/detail/generic/reduce_by_key.inl
index 49f362a49..2ea73feda 100644
--- a/thrust/system/detail/generic/reduce_by_key.inl
+++ b/thrust/system/detail/generic/reduce_by_key.inl
@@ -14,13 +14,10 @@
  *  limitations under the License.
  */
 
-
-/*! \file reduce_by_key.inl
- *  \brief Inline file for reduce_by_key.h.
- */
-
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/detail/minimum_system.h>
 #include <thrust/detail/type_traits.h>
@@ -35,8 +32,7 @@
 #include <thrust/scan.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,12 +47,12 @@ template <typename ValueType, typename TailFlagType, typename AssociativeOperato
 struct reduce_by_key_functor
 {
   AssociativeOperator binary_op;
-  
+
   typedef typename thrust::tuple<ValueType, TailFlagType> result_type;
-  
+
   __host__ __device__
   reduce_by_key_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
-  
+
   __host__ __device__
   result_type operator()(result_type a, result_type b)
   {
@@ -79,7 +75,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -91,27 +87,8 @@ __host__ __device__
 
     typedef unsigned int FlagType;  // TODO use difference_type
 
-    // the pseudocode for deducing the type of the temporary used below:
-    // 
-    // if BinaryFunction is AdaptableBinaryFunction
-    //   TemporaryType = AdaptableBinaryFunction::result_type
-    // else if OutputIterator2 is a "pure" output iterator
-    //   TemporaryType = InputIterator2::value_type
-    // else
-    //   TemporaryType = OutputIterator2::value_type
-    //
-    // XXX upon c++0x, TemporaryType needs to be:
-    // result_of<BinaryFunction>::type
-
-    typedef typename thrust::detail::eval_if<
-      thrust::detail::has_result_type<BinaryFunction>::value,
-      thrust::detail::result_type<BinaryFunction>,
-      thrust::detail::eval_if<
-        thrust::detail::is_output_iterator<OutputIterator2>::value,
-        thrust::iterator_value<InputIterator2>,
-        thrust::iterator_value<OutputIterator2>
-      >
-    >::type ValueType;
+    // Use the input iterator's value type per https://wg21.link/P0571
+    using ValueType = typename thrust::iterator_value<InputIterator2>::type;
 
     if (keys_first == keys_last)
         return thrust::make_pair(keys_output, values_output);
@@ -120,7 +97,7 @@ __host__ __device__
     difference_type n = keys_last - keys_first;
 
     InputIterator2 values_last = values_first + n;
-    
+
     // compute head flags
     thrust::detail::temporary_array<FlagType,ExecutionPolicy> head_flags(exec, n);
     thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, head_flags.begin() + 1, thrust::detail::not2(binary_pred));
@@ -134,7 +111,7 @@ __host__ __device__
     // scan the values by flag
     thrust::detail::temporary_array<ValueType,ExecutionPolicy> scanned_values(exec, n);
     thrust::detail::temporary_array<FlagType,ExecutionPolicy>  scanned_tail_flags(exec, n);
-    
+
     thrust::inclusive_scan
         (exec,
          thrust::make_zip_iterator(thrust::make_tuple(values_first,           head_flags.begin())),
@@ -146,12 +123,12 @@ __host__ __device__
 
     // number of unique keys
     FlagType N = scanned_tail_flags[n - 1] + 1;
-    
-    // scatter the keys and accumulated values    
+
+    // scatter the keys and accumulated values
     thrust::scatter_if(exec, keys_first,            keys_last,             scanned_tail_flags.begin(), head_flags.begin(), keys_output);
     thrust::scatter_if(exec, scanned_values.begin(), scanned_values.end(), scanned_tail_flags.begin(), tail_flags.begin(), values_output);
 
-    return thrust::make_pair(keys_output + N, values_output + N); 
+    return thrust::make_pair(keys_output + N, values_output + N);
 } // end reduce_by_key()
 
 
@@ -163,7 +140,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -185,7 +162,7 @@ template<typename ExecutionPolicy,
 __host__ __device__
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -200,7 +177,7 @@ __host__ __device__
 
   // use plus<T> as default BinaryFunction
   return thrust::reduce_by_key(exec,
-                               keys_first, keys_last, 
+                               keys_first, keys_last,
                                values_first,
                                keys_output,
                                values_output,
@@ -212,5 +189,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/remove.h b/thrust/system/detail/generic/remove.h
index 343f643e9..37354ef80 100644
--- a/thrust/system/detail/generic/remove.h
+++ b/thrust/system/detail/generic/remove.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -107,7 +106,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/remove.inl>
 
diff --git a/thrust/system/detail/generic/remove.inl b/thrust/system/detail/generic/remove.inl
index 6cb5a694b..e51a3caee 100644
--- a/thrust/system/detail/generic/remove.inl
+++ b/thrust/system/detail/generic/remove.inl
@@ -14,10 +14,7 @@
  *  limitations under the License.
  */
 
-
-/*! \file remove.inl
- *  \brief Inline file for remove.h
- */
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/remove.h>
@@ -27,8 +24,7 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -108,7 +104,7 @@ __host__ __device__
 
   // remove into temp
   return thrust::remove_copy_if(exec, temp.begin(), temp.end(), stencil, first, pred);
-} // end remove_if() 
+} // end remove_if()
 
 
 template<typename DerivedPolicy,
@@ -146,5 +142,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/replace.h b/thrust/system/detail/generic/replace.h
index 6167f711a..0821d6c07 100644
--- a/thrust/system/detail/generic/replace.h
+++ b/thrust/system/detail/generic/replace.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -92,7 +91,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/replace.inl>
 
diff --git a/thrust/system/detail/generic/replace.inl b/thrust/system/detail/generic/replace.inl
index ad6f821aa..ed845dd45 100644
--- a/thrust/system/detail/generic/replace.inl
+++ b/thrust/system/detail/generic/replace.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
+#include <thrust/functional.h>
 #include <thrust/system/detail/generic/replace.h>
 #include <thrust/transform.h>
 #include <thrust/replace.h>
-#include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -55,7 +56,7 @@ template<typename Predicate, typename NewType, typename OutputType>
   {
     return pred(y) ? new_value : x;
   } // end operator()()
-  
+
   Predicate pred;
   NewType new_value;
 }; // end new_value_if
@@ -70,7 +71,7 @@ template<typename T>
 
   template<typename U>
   __host__ __device__
-  T operator()(U &x)
+  T operator()(U &)
   {
     return c;
   } // end operator()()
@@ -124,8 +125,9 @@ __host__ __device__
                               const T &old_value,
                               const T &new_value)
 {
-  thrust::detail::equal_to_value<T> pred(old_value);
-  return thrust::replace_copy_if(exec, first, last, result, pred, new_value);
+  using thrust::placeholders::_1;
+
+  return thrust::replace_copy_if(exec, first, last, result, _1 == old_value, new_value);
 } // end replace_copy()
 
 
@@ -164,13 +166,14 @@ __host__ __device__
                const T &old_value,
                const T &new_value)
 {
-  thrust::detail::equal_to_value<T> pred(old_value);
-  return thrust::replace_if(exec, first, last, pred, new_value);
+  using thrust::placeholders::_1;
+
+  return thrust::replace_if(exec, first, last, _1 == old_value, new_value);
 } // end replace()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/reverse.h b/thrust/system/detail/generic/reverse.h
index 11421d41b..65c77ae75 100644
--- a/thrust/system/detail/generic/reverse.h
+++ b/thrust/system/detail/generic/reverse.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/reverse.inl>
 
diff --git a/thrust/system/detail/generic/reverse.inl b/thrust/system/detail/generic/reverse.inl
index b77c75b6f..1ce6db38b 100644
--- a/thrust/system/detail/generic/reverse.inl
+++ b/thrust/system/detail/generic/reverse.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/reverse.h>
 #include <thrust/advance.h>
@@ -23,8 +25,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/iterator/reverse_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -70,6 +71,6 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/detail/generic/scalar/binary_search.h b/thrust/system/detail/generic/scalar/binary_search.h
index 373b59a60..3e019c223 100644
--- a/thrust/system/detail/generic/scalar/binary_search.h
+++ b/thrust/system/detail/generic/scalar/binary_search.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -79,7 +78,7 @@ bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scalar/binary_search.inl>
 
diff --git a/thrust/system/detail/generic/scalar/binary_search.inl b/thrust/system/detail/generic/scalar/binary_search.inl
index 06a240f1e..61c71fba4 100644
--- a/thrust/system/detail/generic/scalar/binary_search.inl
+++ b/thrust/system/detail/generic/scalar/binary_search.inl
@@ -21,8 +21,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -52,7 +51,7 @@ RandomAccessIterator lower_bound_n(RandomAccessIterator first,
   Size start = 0, i;
   while(start < n)
   {
-    i = (start + n) / 2;
+    i = start + (n - start) / 2;  // Overflow-safe variant of (a+b)/2
     if(wrapped_comp(first[i], val))
     {
       start = i + 1;
@@ -62,7 +61,7 @@ RandomAccessIterator lower_bound_n(RandomAccessIterator first,
       n = i;
     }
   } // end while
-  
+
   return first + start;
 }
 
@@ -94,7 +93,7 @@ RandomAccessIterator upper_bound_n(RandomAccessIterator first,
   Size start = 0, i;
   while(start < n)
   {
-    i = (start + n) / 2;
+    i = start + (n - start) / 2;  // Overflow-safe variant of (a+b)/2
     if(wrapped_comp(val, first[i]))
     {
       n = i;
@@ -104,7 +103,7 @@ RandomAccessIterator upper_bound_n(RandomAccessIterator first,
       start = i + 1;
     }
   } // end while
-  
+
   return first + start;
 }
 
@@ -153,7 +152,6 @@ bool binary_search(RandomAccessIterator first, RandomAccessIterator last, const
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scalar/binary_search.inl>
-
diff --git a/thrust/system/detail/generic/scan.h b/thrust/system/detail/generic/scan.h
index c32b0f2b9..476441ab6 100644
--- a/thrust/system/detail/generic/scan.h
+++ b/thrust/system/detail/generic/scan.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -93,7 +92,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scan.inl>
 
diff --git a/thrust/system/detail/generic/scan.inl b/thrust/system/detail/generic/scan.inl
index 95e7c5aeb..45a2aadd0 100644
--- a/thrust/system/detail/generic/scan.inl
+++ b/thrust/system/detail/generic/scan.inl
@@ -26,8 +26,7 @@
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
 #include <thrust/functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -45,21 +44,8 @@ __host__ __device__
                                 InputIterator last,
                                 OutputIterator result)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-  >::type ValueType;
-
   // assume plus as the associative operator
-  return thrust::inclusive_scan(exec, first, last, result, thrust::plus<ValueType>());
+  return thrust::inclusive_scan(exec, first, last, result, thrust::plus<>());
 } // end inclusive_scan()
 
 
@@ -72,21 +58,9 @@ __host__ __device__
                                 InputIterator last,
                                 OutputIterator result)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-
-  typedef typename thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-  >::type ValueType;
-
-  // assume 0 as the initialization value
-  return thrust::exclusive_scan(exec, first, last, result, ValueType(0));
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+  return thrust::exclusive_scan(exec, first, last, result, ValueType{});
 } // end exclusive_scan()
 
 
@@ -102,7 +76,7 @@ __host__ __device__
                                 T init)
 {
   // assume plus as the associative operator
-  return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus<T>());
+  return thrust::exclusive_scan(exec, first, last, result, init, thrust::plus<>());
 } // end exclusive_scan()
 
 
@@ -111,14 +85,16 @@ template<typename ExecutionPolicy,
          typename OutputIterator,
          typename BinaryFunction>
 __host__ __device__
-  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
-                                InputIterator first,
-                                InputIterator last,
+  OutputIterator inclusive_scan(thrust::execution_policy<ExecutionPolicy> &,
+                                InputIterator,
+                                InputIterator,
                                 OutputIterator result,
-                                BinaryFunction binary_op)
+                                BinaryFunction)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end inclusive_scan
 
@@ -129,15 +105,17 @@ template<typename ExecutionPolicy,
          typename T,
          typename BinaryFunction>
 __host__ __device__
-  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
-                                InputIterator first,
-                                InputIterator last,
+  OutputIterator exclusive_scan(thrust::execution_policy<ExecutionPolicy> &,
+                                InputIterator,
+                                InputIterator,
                                 OutputIterator result,
-                                T init,
-                                BinaryFunction binary_op)
+                                T,
+                                BinaryFunction)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end exclusive_scan()
 
@@ -145,5 +123,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/scan_by_key.h b/thrust/system/detail/generic/scan_by_key.h
index 3c2ea7931..9e38ac933 100644
--- a/thrust/system/detail/generic/scan_by_key.h
+++ b/thrust/system/detail/generic/scan_by_key.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -138,7 +137,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scan_by_key.inl>
 
diff --git a/thrust/system/detail/generic/scan_by_key.inl b/thrust/system/detail/generic/scan_by_key.inl
index 129cef17b..0e3100224 100644
--- a/thrust/system/detail/generic/scan_by_key.inl
+++ b/thrust/system/detail/generic/scan_by_key.inl
@@ -14,8 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/cstdint.h>
 #include <thrust/system/detail/generic/scan_by_key.h>
 #include <thrust/functional.h>
 #include <thrust/transform.h>
@@ -26,8 +28,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,12 +43,12 @@ template <typename OutputType, typename HeadFlagType, typename AssociativeOperat
 struct segmented_scan_functor
 {
   AssociativeOperator binary_op;
-  
+
   typedef typename thrust::tuple<OutputType, HeadFlagType> result_type;
-  
+
   __host__ __device__
   segmented_scan_functor(AssociativeOperator _binary_op) : binary_op(_binary_op) {}
-  
+
   __host__ __device__
   result_type operator()(result_type a, result_type b)
   {
@@ -71,8 +72,7 @@ __host__ __device__
                                        InputIterator2 first2,
                                        OutputIterator result)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<InputType1>());
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, thrust::equal_to<>());
 }
 
 
@@ -89,8 +89,7 @@ __host__ __device__
                                        OutputIterator result,
                                        BinaryPredicate binary_pred)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<OutputType>());
+  return thrust::inclusive_scan_by_key(exec, first1, last1, first2, result, binary_pred, thrust::plus<>());
 }
 
 
@@ -109,8 +108,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        AssociativeOperator binary_op)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  typedef unsigned int HeadFlagType;
+  using OutputType = typename thrust::iterator_traits<InputIterator2>::value_type;
+  using HeadFlagType = thrust::detail::uint8_t;
 
   const size_t n = last1 - first1;
 
@@ -120,7 +119,7 @@ __host__ __device__
     thrust::detail::temporary_array<HeadFlagType,DerivedPolicy> flags(exec, n);
     flags[0] = 1; thrust::transform(exec, first1, last1 - 1, first1 + 1, flags.begin() + 1, thrust::detail::not2(binary_pred));
 
-    // scan key-flag tuples, 
+    // scan key-flag tuples,
     // For additional details refer to Section 2 of the following paper
     //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
     //    NVIDIA Technical Report NVR-2008-003, December 2008
@@ -147,8 +146,8 @@ __host__ __device__
                                        InputIterator2 first2,
                                        OutputIterator result)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, OutputType(0));
+  typedef typename thrust::iterator_traits<InputIterator2>::value_type InitType;
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, InitType{});
 }
 
 
@@ -165,8 +164,7 @@ __host__ __device__
                                        OutputIterator result,
                                        T init)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type InputType1;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<InputType1>());
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, thrust::equal_to<>());
 }
 
 
@@ -185,8 +183,7 @@ __host__ __device__
                                        T init,
                                        BinaryPredicate binary_pred)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<OutputType>());
+  return thrust::exclusive_scan_by_key(exec, first1, last1, first2, result, init, binary_pred, thrust::plus<>());
 }
 
 
@@ -207,8 +204,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        AssociativeOperator binary_op)
 {
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type OutputType;
-  typedef unsigned int HeadFlagType;
+  using OutputType = T;
+  using HeadFlagType = thrust::detail::uint8_t;
 
   const size_t n = last1 - first1;
 
@@ -225,7 +222,7 @@ __host__ __device__
     thrust::replace_copy_if(exec, first2, last2 - 1, flags.begin() + 1, temp.begin() + 1, thrust::negate<HeadFlagType>(), init);
     temp[0] = init;
 
-    // scan key-flag tuples, 
+    // scan key-flag tuples,
     // For additional details refer to Section 2 of the following paper
     //    S. Sengupta, M. Harris, and M. Garland. "Efficient parallel scan algorithms for GPUs"
     //    NVIDIA Technical Report NVR-2008-003, December 2008
@@ -244,5 +241,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/scatter.h b/thrust/system/detail/generic/scatter.h
index 4a65a4cc0..6bb7949ef 100644
--- a/thrust/system/detail/generic/scatter.h
+++ b/thrust/system/detail/generic/scatter.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -75,7 +74,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/scatter.inl>
 
diff --git a/thrust/system/detail/generic/scatter.inl b/thrust/system/detail/generic/scatter.inl
index 7a1f52298..5b4798708 100644
--- a/thrust/system/detail/generic/scatter.inl
+++ b/thrust/system/detail/generic/scatter.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/scatter.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/transform.h>
 #include <thrust/iterator/permutation_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -92,5 +93,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/select_system.h b/thrust/system/detail/generic/select_system.h
index 267d7a6f7..7619b80e5 100644
--- a/thrust/system/detail/generic/select_system.h
+++ b/thrust/system/detail/generic/select_system.h
@@ -1,3 +1,4 @@
+
 /*
  *  Copyright 2008-2013 NVIDIA Corporation
  *
@@ -19,67 +20,35 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 #include <thrust/detail/type_traits.h>
-#include <thrust/detail/type_traits/is_metafunction_defined.h>
 #include <thrust/iterator/detail/minimum_system.h>
-#include <thrust/system/detail/generic/type_traits.h>
-#include <thrust/iterator/iterator_traits.h>
+#include <thrust/iterator/detail/device_system_tag.h>
+#include <thrust/iterator/detail/any_system_tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
 {
 namespace generic
 {
-namespace select_system_detail
-{
-
-
-// min_system case 1: both systems have the same type, just return the first one
-template<typename System>
-__host__ __device__
-System &min_system(thrust::execution_policy<System> &system1,
-                   thrust::execution_policy<System> &)
-{
-  return thrust::detail::derived_cast(system1);
-} // end min_system()
 
+template<typename Tag>
+  struct select_system1_exists;
 
-// min_system case 2: systems have differing type and the first type is considered the minimum
-template<typename System1, typename System2>
-__host__ __device__
-  typename thrust::detail::enable_if<
-    thrust::detail::is_same<
-      System1,
-      typename thrust::detail::minimum_system<System1,System2>::type
-    >::value,
-    System1 &
-  >::type
-    min_system(thrust::execution_policy<System1> &system1, thrust::execution_policy<System2> &)
-{
-  return thrust::detail::derived_cast(system1);
-} // end min_system()
+template<typename Tag1, typename Tag2>
+  struct select_system2_exists;
 
+template<typename Tag1, typename Tag2, typename Tag3>
+  struct select_system3_exists;
 
-// min_system case 3: systems have differing type and the second type is considered the minimum
-template<typename System1, typename System2>
-__host__ __device__
-  typename thrust::detail::enable_if<
-    thrust::detail::is_same<
-      System2,
-      typename thrust::detail::minimum_system<System1,System2>::type
-    >::value,
-    System2 &
-  >::type
-    min_system(thrust::execution_policy<System1> &, thrust::execution_policy<System2> &system2)
-{
-  return thrust::detail::derived_cast(system2);
-} // end min_system()
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4>
+  struct select_system4_exists;
 
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5>
+  struct select_system5_exists;
 
-} // end select_system_detail
-
+template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Tag5, typename Tag6>
+  struct select_system6_exists;
 
 template<typename System>
 __host__ __device__
@@ -87,11 +56,7 @@ __host__ __device__
     select_system1_exists<System>::value,
     System &
   >::type
-    select_system(thrust::execution_policy<System> &system)
-{
-  return thrust::detail::derived_cast(system);
-} // end select_system()
-
+    select_system(thrust::execution_policy<System> &system);
 
 template<typename System1, typename System2>
 __host__ __device__
@@ -99,11 +64,7 @@ __host__ __device__
     thrust::detail::minimum_system<System1,System2>
   >::type
     &select_system(thrust::execution_policy<System1> &system1,
-                   thrust::execution_policy<System2> &system2)
-{
-  return select_system_detail::min_system(system1,system2);
-} // end select_system()
-
+                   thrust::execution_policy<System2> &system2);
 
 template<typename System1, typename System2, typename System3>
 __host__ __device__
@@ -113,11 +74,7 @@ __host__ __device__
   >::type
     &select_system(thrust::execution_policy<System1> &system1,
                    thrust::execution_policy<System2> &system2,
-                   thrust::execution_policy<System3> &system3)
-{
-  return select_system(select_system(system1,system2), system3);
-} // end select_system()
-
+                   thrust::execution_policy<System3> &system3);
 
 template<typename System1, typename System2, typename System3, typename System4>
 __host__ __device__
@@ -128,11 +85,7 @@ __host__ __device__
     &select_system(thrust::execution_policy<System1> &system1,
                    thrust::execution_policy<System2> &system2,
                    thrust::execution_policy<System3> &system3,
-                   thrust::execution_policy<System4> &system4)
-{
-  return select_system(select_system(system1,system2,system3), system4);
-} // end select_system()
-
+                   thrust::execution_policy<System4> &system4);
 
 template<typename System1, typename System2, typename System3, typename System4, typename System5>
 __host__ __device__
@@ -144,11 +97,7 @@ __host__ __device__
                    thrust::execution_policy<System2> &system2,
                    thrust::execution_policy<System3> &system3,
                    thrust::execution_policy<System4> &system4,
-                   thrust::execution_policy<System5> &system5)
-{
-  return select_system(select_system(system1,system2,system3,system4), system5);
-} // end select_system()
-
+                   thrust::execution_policy<System5> &system5);
 
 template<typename System1, typename System2, typename System3, typename System4, typename System5, typename System6>
 __host__ __device__
@@ -161,22 +110,15 @@ __host__ __device__
                    thrust::execution_policy<System3> &system3,
                    thrust::execution_policy<System4> &system4,
                    thrust::execution_policy<System5> &system5,
-                   thrust::execution_policy<System6> &system6)
-{
-  return select_system(select_system(system1,system2,system3,system4,system5), system6);
-} // end select_system()
+                   thrust::execution_policy<System6> &system6);
 
-
-// map a single any_system_tag to device_system_tag
+// Map a single any_system_tag to device_system_tag.
 inline __host__ __device__
-thrust::device_system_tag select_system(thrust::any_system_tag)
-{
-  return thrust::device_system_tag();
-} // end select_system()
-
+thrust::device_system_tag select_system(thrust::any_system_tag);
 
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
+#include <thrust/system/detail/generic/select_system.inl>
diff --git a/thrust/system/detail/generic/select_system.inl b/thrust/system/detail/generic/select_system.inl
new file mode 100644
index 000000000..b69d17c45
--- /dev/null
+++ b/thrust/system/detail/generic/select_system.inl
@@ -0,0 +1,178 @@
+/*
+ *  Copyright 2008-2013 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/execution_policy.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/system/detail/generic/select_system_exists.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system
+{
+namespace detail
+{
+namespace generic
+{
+namespace select_system_detail
+{
+
+
+// min_system case 1: both systems have the same type, just return the first one
+template<typename System>
+__host__ __device__
+System &min_system(thrust::execution_policy<System> &system1,
+                   thrust::execution_policy<System> &)
+{
+  return thrust::detail::derived_cast(system1);
+} // end min_system()
+
+
+// min_system case 2: systems have differing type and the first type is considered the minimum
+template<typename System1, typename System2>
+__host__ __device__
+typename thrust::detail::enable_if<
+  thrust::detail::is_same<
+    System1,
+    typename thrust::detail::minimum_system<System1,System2>::type
+  >::value,
+  System1 &
+>::type
+  min_system(thrust::execution_policy<System1> &system1, thrust::execution_policy<System2> &)
+{
+  return thrust::detail::derived_cast(system1);
+} // end min_system()
+
+
+// min_system case 3: systems have differing type and the second type is considered the minimum
+template<typename System1, typename System2>
+__host__ __device__
+typename thrust::detail::enable_if<
+  thrust::detail::is_same<
+    System2,
+    typename thrust::detail::minimum_system<System1,System2>::type
+  >::value,
+    System2 &
+  >::type
+    min_system(thrust::execution_policy<System1> &, thrust::execution_policy<System2> &system2)
+{
+  return thrust::detail::derived_cast(system2);
+} // end min_system()
+
+
+} // end select_system_detail
+
+
+template<typename System>
+__host__ __device__
+  typename thrust::detail::disable_if<
+    select_system1_exists<System>::value,
+    System &
+  >::type
+    select_system(thrust::execution_policy<System> &system)
+{
+  return thrust::detail::derived_cast(system);
+} // end select_system()
+
+
+template<typename System1, typename System2>
+__host__ __device__
+  typename thrust::detail::enable_if_defined<
+    thrust::detail::minimum_system<System1,System2>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2)
+{
+  return select_system_detail::min_system(system1,system2);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system3_exists<System1,System2,System3>::value,
+    thrust::detail::minimum_system<System1,System2,System3>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3)
+{
+  return select_system(select_system(system1,system2), system3);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system4_exists<System1,System2,System3,System4>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4)
+{
+  return select_system(select_system(system1,system2,system3), system4);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system5_exists<System1,System2,System3,System4,System5>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5)
+{
+  return select_system(select_system(system1,system2,system3,system4), system5);
+} // end select_system()
+
+
+template<typename System1, typename System2, typename System3, typename System4, typename System5, typename System6>
+__host__ __device__
+  typename thrust::detail::lazy_disable_if<
+    select_system6_exists<System1,System2,System3,System4,System5,System6>::value,
+    thrust::detail::minimum_system<System1,System2,System3,System4,System5,System6>
+  >::type
+    &select_system(thrust::execution_policy<System1> &system1,
+                   thrust::execution_policy<System2> &system2,
+                   thrust::execution_policy<System3> &system3,
+                   thrust::execution_policy<System4> &system4,
+                   thrust::execution_policy<System5> &system5,
+                   thrust::execution_policy<System6> &system6)
+{
+  return select_system(select_system(system1,system2,system3,system4,system5), system6);
+} // end select_system()
+
+
+// map a single any_system_tag to device_system_tag
+inline __host__ __device__
+thrust::device_system_tag select_system(thrust::any_system_tag)
+{
+  return thrust::device_system_tag();
+} // end select_system()
+
+
+} // end generic
+} // end detail
+} // end system
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/detail/generic/type_traits.h b/thrust/system/detail/generic/select_system_exists.h
similarity index 99%
rename from thrust/system/detail/generic/type_traits.h
rename to thrust/system/detail/generic/select_system_exists.h
index ba8ef8bb7..29d05781d 100644
--- a/thrust/system/detail/generic/type_traits.h
+++ b/thrust/system/detail/generic/select_system_exists.h
@@ -23,8 +23,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 // forward declaration of any_system_tag for any_conversion below
 struct any_system_tag;
@@ -164,5 +163,5 @@ template<typename Tag1, typename Tag2, typename Tag3, typename Tag4, typename Ta
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/sequence.h b/thrust/system/detail/generic/sequence.h
index a7bc842ae..26bf17bb8 100644
--- a/thrust/system/detail/generic/sequence.h
+++ b/thrust/system/detail/generic/sequence.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/sequence.inl>
 
diff --git a/thrust/system/detail/generic/sequence.inl b/thrust/system/detail/generic/sequence.inl
index 53f54c5f0..0e11dd75d 100644
--- a/thrust/system/detail/generic/sequence.inl
+++ b/thrust/system/detail/generic/sequence.inl
@@ -14,43 +14,20 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/sequence.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/tabulate.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
 {
 namespace generic
 {
-namespace sequence_detail
-{
-
-
-template<typename T>
-struct sequence_functor
-{
-  T init, step;
-
-  __host__ __device__
-  sequence_functor(T init, T step)
-    : init(init), step(step)
-  {}
-
-  template<typename Index>
-  __host__ __device__
-  T operator()(Index i) const
-  {
-    return init + step * i;
-  }
-};
-
-
-} // end sequence_detail
 
 
 template<typename DerivedPolicy, typename ForwardIterator>
@@ -75,6 +52,35 @@ __host__ __device__
   thrust::sequence(exec, first, last, init, T(1));
 } // end sequence()
 
+namespace detail
+{
+template <typename T, typename = void>
+struct compute_sequence_value
+{
+  T init;
+  T step;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  T operator()(std::size_t i) const
+  {
+    return init + step * i;
+  }
+};
+template <typename T>
+struct compute_sequence_value<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
+{
+  T init;
+  T step;
+
+  __thrust_exec_check_disable__
+  __host__ __device__
+  T operator()(std::size_t i) const
+  {
+    return init + step * static_cast<T>(i);
+  }
+};
+}
 
 template<typename DerivedPolicy, typename ForwardIterator, typename T>
 __host__ __device__
@@ -84,13 +90,17 @@ __host__ __device__
                 T init,
                 T step)
 {
-  // XXX TODO use a placeholder expression here
-  thrust::tabulate(exec, first, last, sequence_detail::sequence_functor<T>(init, step));
+
+  thrust::tabulate(exec,
+                   first,
+                   last,
+                   detail::compute_sequence_value<T>{std::move(init),
+                                                     std::move(step)});
 } // end sequence()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/set_operations.h b/thrust/system/detail/generic/set_operations.h
index 4dbee0ae4..37665d78d 100644
--- a/thrust/system/detail/generic/set_operations.h
+++ b/thrust/system/detail/generic/set_operations.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -313,7 +312,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/set_operations.inl>
 
diff --git a/thrust/system/detail/generic/set_operations.inl b/thrust/system/detail/generic/set_operations.inl
index a804758db..4363be5c0 100644
--- a/thrust/system/detail/generic/set_operations.inl
+++ b/thrust/system/detail/generic/set_operations.inl
@@ -25,8 +25,7 @@
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -388,16 +387,18 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename StrictWeakOrdering>
 __host__ __device__
-OutputIterator set_difference(thrust::execution_policy<DerivedPolicy> &exec,
-                              InputIterator1                           first1,
-                              InputIterator1                           last1,
-                              InputIterator2                           first2,
-                              InputIterator2                           last2,
-                              OutputIterator                           result,
-                              StrictWeakOrdering                       comp)
+OutputIterator set_difference(thrust::execution_policy<DerivedPolicy> &,
+                              InputIterator1,
+                              InputIterator1,
+                              InputIterator2,
+                              InputIterator2,
+                              OutputIterator  result,
+                              StrictWeakOrdering)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end set_difference()
 
@@ -408,16 +409,18 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename StrictWeakOrdering>
 __host__ __device__
-OutputIterator set_intersection(thrust::execution_policy<DerivedPolicy> &exec,
-                                InputIterator1                           first1,
-                                InputIterator1                           last1,
-                                InputIterator2                           first2,
-                                InputIterator2                           last2,
-                                OutputIterator                           result,
-                                StrictWeakOrdering                       comp)
+OutputIterator set_intersection(thrust::execution_policy<DerivedPolicy> &,
+                                InputIterator1,
+                                InputIterator1,
+                                InputIterator2,
+                                InputIterator2,
+                                OutputIterator result,
+                                StrictWeakOrdering)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end set_intersection()
 
@@ -428,16 +431,18 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename StrictWeakOrdering>
 __host__ __device__
-OutputIterator set_symmetric_difference(thrust::execution_policy<DerivedPolicy> &exec,
-                                        InputIterator1                           first1,
-                                        InputIterator1                           last1,
-                                        InputIterator2                           first2,
-                                        InputIterator2                           last2,
-                                        OutputIterator                           result,
-                                        StrictWeakOrdering                       comp)
+OutputIterator set_symmetric_difference(thrust::execution_policy<DerivedPolicy> &,
+                                        InputIterator1,
+                                        InputIterator1,
+                                        InputIterator2,
+                                        InputIterator2,
+                                        OutputIterator result,
+                                        StrictWeakOrdering)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end set_symmetric_difference()
 
@@ -448,16 +453,18 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename StrictWeakOrdering>
 __host__ __device__
-OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &exec,
-                         InputIterator1                           first1,
-                         InputIterator1                           last1,
-                         InputIterator2                           first2,
-                         InputIterator2                           last2,
-                         OutputIterator                           result,
-                         StrictWeakOrdering                       comp)
+OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &,
+                         InputIterator1,
+                         InputIterator1,
+                         InputIterator2,
+                         InputIterator2,
+                         OutputIterator result,
+                         StrictWeakOrdering)
 {
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator1, false>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<InputIterator1, false>::value)
+  , "unimplemented for this system"
+  );
   return result;
 } // end set_union()
 
@@ -465,5 +472,5 @@ OutputIterator set_union(thrust::execution_policy<DerivedPolicy> &exec,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/shuffle.h b/thrust/system/detail/generic/shuffle.h
new file mode 100644
index 000000000..8f8e21afd
--- /dev/null
+++ b/thrust/system/detail/generic/shuffle.h
@@ -0,0 +1,54 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file shuffle.h
+ *  \brief Generic implementations of shuffle functions.
+ */
+
+#pragma once
+
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <thrust/system/detail/generic/tag.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system {
+namespace detail {
+namespace generic {
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g);
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g);
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+THRUST_NAMESPACE_END
+
+#include <thrust/system/detail/generic/shuffle.inl>
+
+#endif
diff --git a/thrust/system/detail/generic/shuffle.inl b/thrust/system/detail/generic/shuffle.inl
new file mode 100644
index 000000000..baece51be
--- /dev/null
+++ b/thrust/system/detail/generic/shuffle.inl
@@ -0,0 +1,190 @@
+/*
+ *  Copyright 2008-20120 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/temporary_array.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/transform_output_iterator.h>
+#include <thrust/random.h>
+#include <thrust/scan.h>
+#include <thrust/system/detail/generic/shuffle.h>
+
+#include <cstdint>
+
+THRUST_NAMESPACE_BEGIN
+namespace system {
+namespace detail {
+namespace generic {
+
+// An implementation of a Feistel cipher for operating on 64 bit keys
+class feistel_bijection {
+  struct round_state {
+    std::uint32_t left;
+    std::uint32_t right;
+  };
+
+ public:
+  template <class URBG>
+  __host__ __device__ feistel_bijection(std::uint64_t m, URBG&& g) {
+    std::uint64_t total_bits = get_cipher_bits(m);
+    // Half bits rounded down
+    left_side_bits = total_bits / 2;
+    left_side_mask = (1ull << left_side_bits) - 1;
+    // Half the bits rounded up
+    right_side_bits = total_bits - left_side_bits;
+    right_side_mask = (1ull << right_side_bits) - 1;
+
+    for (std::uint32_t i = 0; i < num_rounds; i++) {
+      key[i] = g();
+    }
+  }
+
+  __host__ __device__ std::uint64_t nearest_power_of_two() const {
+    return 1ull << (left_side_bits + right_side_bits);
+  }
+
+  __host__ __device__ std::uint64_t operator()(const std::uint64_t val) const {
+    std::uint32_t state[2] = { static_cast<std::uint32_t>( val >> right_side_bits ), static_cast<std::uint32_t>( val & right_side_mask ) };
+    for( std::uint32_t i = 0; i < num_rounds; i++ )
+    {
+        std::uint32_t hi, lo;
+        constexpr std::uint64_t M0 = UINT64_C( 0xD2B74407B1CE6E93 );
+        mulhilo( M0, state[0], hi, lo );
+        lo = ( lo << ( right_side_bits - left_side_bits ) ) | state[1] >> left_side_bits;
+        state[0] = ( ( hi ^ key[i] ) ^ state[1] ) & left_side_mask;
+        state[1] = lo & right_side_mask;
+    }
+    // Combine the left and right sides together to get result
+    return static_cast<std::uint64_t>(state[0] << right_side_bits) | static_cast<std::uint64_t>(state[1]);
+  }
+
+ private:
+   // Perform 64 bit multiplication and save result in two 32 bit int
+   static __host__ __device__ void mulhilo( std::uint64_t a, std::uint64_t b, std::uint32_t& hi, std::uint32_t& lo )
+   {
+       std::uint64_t product = a * b;
+       hi = static_cast<std::uint32_t>( product >> 32 );
+       lo = static_cast<std::uint32_t>( product );
+   }
+
+  // Find the nearest power of two
+  static __host__ __device__ std::uint64_t get_cipher_bits(std::uint64_t m) {
+    if (m <= 16) return 4;
+    std::uint64_t i = 0;
+    m--;
+    while (m != 0) {
+      i++;
+      m >>= 1;
+    }
+    return i;
+  }
+
+  static constexpr std::uint32_t num_rounds = 24;
+  std::uint64_t right_side_bits;
+  std::uint64_t left_side_bits;
+  std::uint64_t right_side_mask;
+  std::uint64_t left_side_mask;
+  std::uint32_t key[num_rounds];
+};
+
+struct key_flag_tuple {
+  std::uint64_t key;
+  std::uint64_t flag;
+};
+
+// scan only flags
+struct key_flag_scan_op {
+  __host__ __device__ key_flag_tuple operator()(const key_flag_tuple& a,
+                                                const key_flag_tuple& b) {
+    return {b.key, a.flag + b.flag};
+  }
+};
+
+struct construct_key_flag_op {
+  std::uint64_t m;
+  feistel_bijection bijection;
+  __host__ __device__ construct_key_flag_op(std::uint64_t m,
+                                            feistel_bijection bijection)
+      : m(m), bijection(bijection) {}
+  __host__ __device__ key_flag_tuple operator()(std::uint64_t idx) {
+    auto gather_key = bijection(idx);
+    return key_flag_tuple{gather_key, (gather_key < m) ? 1ull : 0ull};
+  }
+};
+
+template <typename InputIterT, typename OutputIterT>
+struct write_output_op {
+  std::uint64_t m;
+  InputIterT in;
+  OutputIterT out;
+  // flag contains inclusive scan of valid keys
+  // perform gather using valid keys
+  __thrust_exec_check_disable__
+  __host__ __device__ std::size_t operator()(key_flag_tuple x) {
+    if (x.key < m) {
+      // -1 because inclusive scan
+      out[x.flag - 1] = in[x.key];
+    }
+    return 0;  // Discarded
+  }
+};
+
+template <typename ExecutionPolicy, typename RandomIterator, typename URBG>
+__host__ __device__ void shuffle(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, URBG&& g) {
+  using InputType = typename thrust::iterator_value_t<RandomIterator>;
+
+  // copy input to temp buffer
+  thrust::detail::temporary_array<InputType, ExecutionPolicy> temp(exec, first,
+                                                                   last);
+  thrust::shuffle_copy(exec, temp.begin(), temp.end(), first, g);
+}
+
+template <typename ExecutionPolicy, typename RandomIterator,
+          typename OutputIterator, typename URBG>
+__host__ __device__ void shuffle_copy(
+    thrust::execution_policy<ExecutionPolicy>& exec, RandomIterator first,
+    RandomIterator last, OutputIterator result, URBG&& g) {
+  // m is the length of the input
+  // we have an available bijection of length n via a feistel cipher
+  std::size_t m = last - first;
+  feistel_bijection bijection(m, g);
+  std::uint64_t n = bijection.nearest_power_of_two();
+
+  // perform stream compaction over length n bijection to get length m
+  // pseudorandom bijection over the original input
+  thrust::counting_iterator<std::uint64_t> indices(0);
+  thrust::transform_iterator<construct_key_flag_op, decltype(indices),
+                             key_flag_tuple>
+      key_flag_it(indices, construct_key_flag_op(m, bijection));
+  write_output_op<RandomIterator, decltype(result)> write_functor{m, first,
+                                                                  result};
+  auto gather_output_it = thrust::make_transform_output_iterator(
+      thrust::discard_iterator<std::size_t>(), write_functor);
+  // the feistel_bijection outputs a stream of permuted indices in range [0,n)
+  // flag each value < m and compact it, so we have a set of permuted indices in
+  // range [0,m) each thread gathers an input element according to its
+  // pseudorandom permuted index
+  thrust::inclusive_scan(exec, key_flag_it, key_flag_it + n, gather_output_it,
+                         key_flag_scan_op());
+}
+
+}  // end namespace generic
+}  // end namespace detail
+}  // end namespace system
+THRUST_NAMESPACE_END
diff --git a/thrust/system/detail/generic/sort.h b/thrust/system/detail/generic/sort.h
index 9d4ac1998..cd8d45562 100644
--- a/thrust/system/detail/generic/sort.h
+++ b/thrust/system/detail/generic/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -148,7 +147,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/sort.inl>
 
diff --git a/thrust/system/detail/generic/sort.inl b/thrust/system/detail/generic/sort.inl
index fa215a432..632cab435 100644
--- a/thrust/system/detail/generic/sort.inl
+++ b/thrust/system/detail/generic/sort.inl
@@ -28,8 +28,7 @@
 #include <thrust/tuple.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -184,12 +183,14 @@ template<typename DerivedPolicy,
          typename StrictWeakOrdering>
 __host__ __device__
   void stable_sort(thrust::execution_policy<DerivedPolicy> &,
-                   RandomAccessIterator first,
-                   RandomAccessIterator last,
-                   StrictWeakOrdering comp)
-{
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value) );
+                   RandomAccessIterator,
+                   RandomAccessIterator,
+                   StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator, false>::value)
+  , "unimplemented for this system"
+  );
 } // end stable_sort()
 
 
@@ -199,18 +200,20 @@ template<typename DerivedPolicy,
          typename StrictWeakOrdering>
 __host__ __device__
   void stable_sort_by_key(thrust::execution_policy<DerivedPolicy> &,
-                          RandomAccessIterator1 keys_first,
-                          RandomAccessIterator1 keys_last,
-                          RandomAccessIterator2 values_first,
-                          StrictWeakOrdering comp)
-{
-  // unimplemented primitive
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator1, false>::value) );
+                          RandomAccessIterator1,
+                          RandomAccessIterator1,
+                          RandomAccessIterator2,
+                          StrictWeakOrdering)
+{
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<RandomAccessIterator1, false>::value)
+  , "unimplemented for this system"
+  );
 } // end stable_sort_by_key()
 
 
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/swap_ranges.h b/thrust/system/detail/generic/swap_ranges.h
index 78769715c..edb5acf31 100644
--- a/thrust/system/detail/generic/swap_ranges.h
+++ b/thrust/system/detail/generic/swap_ranges.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -41,7 +40,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/swap_ranges.inl>
 
diff --git a/thrust/system/detail/generic/swap_ranges.inl b/thrust/system/detail/generic/swap_ranges.inl
index 81977adc2..ea42df35b 100644
--- a/thrust/system/detail/generic/swap_ranges.inl
+++ b/thrust/system/detail/generic/swap_ranges.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/swap_ranges.h>
 #include <thrust/tuple.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/for_each.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -74,5 +75,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/tabulate.h b/thrust/system/detail/generic/tabulate.h
index 5cb75e928..041093e82 100644
--- a/thrust/system/detail/generic/tabulate.h
+++ b/thrust/system/detail/generic/tabulate.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -43,7 +42,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/tabulate.inl>
 
diff --git a/thrust/system/detail/generic/tabulate.inl b/thrust/system/detail/generic/tabulate.inl
index 1a740d26a..0fd2121c1 100644
--- a/thrust/system/detail/generic/tabulate.inl
+++ b/thrust/system/detail/generic/tabulate.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tabulate.h>
 #include <thrust/iterator/iterator_traits.h>
@@ -21,8 +23,7 @@
 #include <thrust/distance.h>
 #include <thrust/iterator/counting_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -55,6 +56,6 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/detail/generic/tag.h b/thrust/system/detail/generic/tag.h
index 4da1e79ce..48f094797 100644
--- a/thrust/system/detail/generic/tag.h
+++ b/thrust/system/detail/generic/tag.h
@@ -23,8 +23,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -44,5 +43,5 @@ struct tag
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/temporary_buffer.h b/thrust/system/detail/generic/temporary_buffer.h
index 953401139..6b7e01ff2 100644
--- a/thrust/system/detail/generic/temporary_buffer.h
+++ b/thrust/system/detail/generic/temporary_buffer.h
@@ -21,8 +21,7 @@
 #include <thrust/pair.h>
 #include <thrust/detail/pointer.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -37,6 +36,13 @@ __host__ __device__
     get_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, typename thrust::pointer<T,DerivedPolicy>::difference_type n);
 
 
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t n);
+
+
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
   void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p);
@@ -45,7 +51,7 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/temporary_buffer.inl>
 
diff --git a/thrust/system/detail/generic/temporary_buffer.inl b/thrust/system/detail/generic/temporary_buffer.inl
index 838d013bc..254c48cb9 100644
--- a/thrust/system/detail/generic/temporary_buffer.inl
+++ b/thrust/system/detail/generic/temporary_buffer.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/temporary_buffer.h>
 #include <thrust/detail/pointer.h>
 #include <thrust/detail/malloc_and_free.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -47,10 +48,33 @@ __host__ __device__
 } // end get_temporary_buffer()
 
 
+__thrust_exec_check_disable__
+template<typename DerivedPolicy, typename Pointer>
+__host__ __device__
+  void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p, std::ptrdiff_t)
+{
+  // If we are here, no user customization of the three-argument signature with
+  // a size parameter of `return_temporary_buffer` was found. There may be an
+  // old two-argument signature `return_temporary_buffer` though, so we make
+  // another ADL call to try and find one.
+  //
+  // The interface layer downcast and then did ADL dispatch - there were no
+  // matches for DerivedPolicy (aka no one customized the three-argument
+  // signature), so this overload got found an implicit upcast to
+  // `execution_policy<DerivedPolicy>` was done. Now, we're looking for a
+  // customization of the two-argument signature so we need to downcast again.
+  return_temporary_buffer(thrust::detail::derived_cast(thrust::detail::strip_const(exec)), p);
+} // end return_temporary_buffer()
+
+
+__thrust_exec_check_disable__
 template<typename DerivedPolicy, typename Pointer>
 __host__ __device__
   void return_temporary_buffer(thrust::execution_policy<DerivedPolicy> &exec, Pointer p)
 {
+  // If we are here, no user customization of either the old two-argument
+  // signature or the new three-argument signature with a size parameter of
+  // `return_temporary_buffer` was found.
   thrust::free(exec, p);
 } // end return_temporary_buffer()
 
@@ -58,5 +82,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform.h b/thrust/system/detail/generic/transform.h
index 1aa2f4993..30e032696 100644
--- a/thrust/system/detail/generic/transform.h
+++ b/thrust/system/detail/generic/transform.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -100,7 +99,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform.inl>
 
diff --git a/thrust/system/detail/generic/transform.inl b/thrust/system/detail/generic/transform.inl
index 589eb65c7..122c42580 100644
--- a/thrust/system/detail/generic/transform.inl
+++ b/thrust/system/detail/generic/transform.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/transform.h>
 #include <thrust/for_each.h>
@@ -23,8 +25,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/internal_functional.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -186,5 +187,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform_reduce.h b/thrust/system/detail/generic/transform_reduce.h
index 23123fa49..af510296e 100644
--- a/thrust/system/detail/generic/transform_reduce.h
+++ b/thrust/system/detail/generic/transform_reduce.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -47,7 +46,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform_reduce.inl>
 
diff --git a/thrust/system/detail/generic/transform_reduce.inl b/thrust/system/detail/generic/transform_reduce.inl
index 7340f8355..539c3b22c 100644
--- a/thrust/system/detail/generic/transform_reduce.inl
+++ b/thrust/system/detail/generic/transform_reduce.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/transform_reduce.h>
 #include <thrust/reduce.h>
 #include <thrust/iterator/transform_iterator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -30,8 +31,8 @@ namespace generic
 
 
 template<typename DerivedPolicy,
-         typename InputIterator, 
-         typename UnaryFunction, 
+         typename InputIterator,
+         typename UnaryFunction,
          typename OutputType,
          typename BinaryFunction>
 __host__ __device__
@@ -52,5 +53,5 @@ __host__ __device__
 } // end generic
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/transform_scan.h b/thrust/system/detail/generic/transform_scan.h
index 3f81434fc..05054c965 100644
--- a/thrust/system/detail/generic/transform_scan.h
+++ b/thrust/system/detail/generic/transform_scan.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -62,7 +61,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/transform_scan.inl>
 
diff --git a/thrust/system/detail/generic/transform_scan.inl b/thrust/system/detail/generic/transform_scan.inl
index 886fcc122..c9c976687 100644
--- a/thrust/system/detail/generic/transform_scan.inl
+++ b/thrust/system/detail/generic/transform_scan.inl
@@ -18,15 +18,15 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/detail/generic/transform_scan.h>
-#include <thrust/scan.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/detail/type_traits/function_traits.h>
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/scan.h>
+#include <thrust/system/detail/generic/transform_scan.h>
+#include <thrust/type_traits/remove_cvref.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -48,27 +48,10 @@ __host__ __device__
                                           UnaryFunction unary_op,
                                           BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<UnaryFunction>::type
-
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<UnaryFunction>::value,
-    thrust::detail::result_type<UnaryFunction>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using InputType = typename thrust::iterator_value<InputIterator>::type;
+  using ResultType = thrust::detail::invoke_result_t<UnaryFunction, InputType>;
+  using ValueType = thrust::remove_cvref_t<ResultType>;
 
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
@@ -81,7 +64,7 @@ template<typename ExecutionPolicy,
          typename InputIterator,
          typename OutputIterator,
          typename UnaryFunction,
-         typename T,
+         typename InitialValueType,
          typename AssociativeOperator>
 __host__ __device__
   OutputIterator transform_exclusive_scan(thrust::execution_policy<ExecutionPolicy> &exec,
@@ -89,30 +72,11 @@ __host__ __device__
                                           InputIterator last,
                                           OutputIterator result,
                                           UnaryFunction unary_op,
-                                          T init,
+                                          InitialValueType init,
                                           AssociativeOperator binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if UnaryFunction is AdaptableUnaryFunction
-  //   TemporaryType = AdaptableUnaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<UnaryFunction>::type
-
-  typedef typename thrust::detail::eval_if<
-    thrust::detail::has_result_type<UnaryFunction>::value,
-    thrust::detail::result_type<UnaryFunction>,
-    thrust::detail::eval_if<
-      thrust::detail::is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = thrust::remove_cvref_t<InitialValueType>;
 
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _first(first, unary_op);
   thrust::transform_iterator<UnaryFunction, InputIterator, ValueType> _last(last, unary_op);
@@ -124,5 +88,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/uninitialized_copy.h b/thrust/system/detail/generic/uninitialized_copy.h
index 2d1b0010d..bac5bcf96 100644
--- a/thrust/system/detail/generic/uninitialized_copy.h
+++ b/thrust/system/detail/generic/uninitialized_copy.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/uninitialized_copy.inl>
 
diff --git a/thrust/system/detail/generic/uninitialized_copy.inl b/thrust/system/detail/generic/uninitialized_copy.inl
index d6babf65c..679d1f6ba 100644
--- a/thrust/system/detail/generic/uninitialized_copy.inl
+++ b/thrust/system/detail/generic/uninitialized_copy.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/uninitialized_copy.h>
 #include <thrust/copy.h>
@@ -22,8 +24,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -189,5 +190,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/uninitialized_fill.h b/thrust/system/detail/generic/uninitialized_fill.h
index 6acc65d08..4f5404508 100644
--- a/thrust/system/detail/generic/uninitialized_fill.h
+++ b/thrust/system/detail/generic/uninitialized_fill.h
@@ -20,8 +20,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +50,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/uninitialized_fill.inl>
 
diff --git a/thrust/system/detail/generic/uninitialized_fill.inl b/thrust/system/detail/generic/uninitialized_fill.inl
index 0d4cf3f54..062414945 100644
--- a/thrust/system/detail/generic/uninitialized_fill.inl
+++ b/thrust/system/detail/generic/uninitialized_fill.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/uninitialized_fill.h>
 #include <thrust/fill.h>
@@ -21,8 +23,7 @@
 #include <thrust/detail/type_traits.h>
 #include <thrust/iterator/iterator_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -130,5 +131,5 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/unique.h b/thrust/system/detail/generic/unique.h
index 04388cbc0..ce3bff884 100644
--- a/thrust/system/detail/generic/unique.h
+++ b/thrust/system/detail/generic/unique.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/generic/tag.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,10 +68,30 @@ OutputIterator unique_copy(thrust::execution_policy<DerivedPolicy> &exec,
                            BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/unique.inl>
 
diff --git a/thrust/system/detail/generic/unique.inl b/thrust/system/detail/generic/unique.inl
index f5a6d644c..bb66e3585 100644
--- a/thrust/system/detail/generic/unique.inl
+++ b/thrust/system/detail/generic/unique.inl
@@ -14,11 +14,6 @@
  *  limitations under the License.
  */
 
-
-/*! \file unique.inl
- *  \brief Inline file for unique.h.
- */
-
 #pragma once
 
 #include <thrust/detail/config.h>
@@ -29,11 +24,12 @@
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/internal_functional.h>
 #include <thrust/detail/copy_if.h>
+#include <thrust/detail/count.h>
 #include <thrust/distance.h>
 #include <thrust/functional.h>
+#include <thrust/detail/range/head_flags.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -65,9 +61,9 @@ __host__ __device__
                          BinaryPredicate binary_pred)
 {
   typedef typename thrust::iterator_traits<ForwardIterator>::value_type InputType;
-  
+
   thrust::detail::temporary_array<InputType,DerivedPolicy> input(exec, first, last);
-  
+
   return thrust::unique_copy(exec, input.begin(), input.end(), first, binary_pred);
 } // end unique()
 
@@ -97,22 +93,47 @@ __host__ __device__
                              OutputIterator output,
                              BinaryPredicate binary_pred)
 {
-  // empty sequence
-  if(first == last)
-    return output;
-  
-  thrust::detail::temporary_array<int,DerivedPolicy> stencil(exec, thrust::distance(first, last));
+  thrust::detail::head_flags<InputIterator, BinaryPredicate> stencil(first, last, binary_pred);
+
+  using namespace thrust::placeholders;
+
+  return thrust::copy_if(exec, first, last, stencil.begin(), output, _1);
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  thrust::detail::head_flags<ForwardIterator, BinaryPredicate> stencil(first, last, binary_pred);
   
-  // mark first element in each group
-  stencil[0] = 1; 
-  thrust::transform(exec, first, last - 1, first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); 
+  using namespace thrust::placeholders;
   
-  return thrust::copy_if(exec, first, last, stencil.begin(), output, thrust::identity<int>());
+  return thrust::count_if(exec, stencil.begin(), stencil.end(), _1);
+} // end unique_copy()
+
+
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(thrust::execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last)
+{
+  typedef typename thrust::iterator_value<ForwardIterator>::type value_type;
+  return thrust::unique_count(exec, first, last, thrust::equal_to<value_type>());
 } // end unique_copy()
 
 
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/generic/unique_by_key.h b/thrust/system/detail/generic/unique_by_key.h
index cb03179de..0ea9e7cc8 100644
--- a/thrust/system/detail/generic/unique_by_key.h
+++ b/thrust/system/detail/generic/unique_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/tag.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -89,7 +88,7 @@ __host__ __device__
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/generic/unique_by_key.inl>
 
diff --git a/thrust/system/detail/generic/unique_by_key.inl b/thrust/system/detail/generic/unique_by_key.inl
index 89f2288da..ffcf1dd0c 100644
--- a/thrust/system/detail/generic/unique_by_key.inl
+++ b/thrust/system/detail/generic/unique_by_key.inl
@@ -26,9 +26,9 @@
 #include <thrust/detail/internal_functional.h>
 #include <thrust/detail/copy_if.h>
 #include <thrust/unique.h>
+#include <thrust/detail/range/head_flags.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,11 +40,12 @@ namespace generic
 template<typename ExecutionPolicy,
          typename ForwardIterator1,
          typename ForwardIterator2>
-  thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  ForwardIterator1 keys_first, 
-                  ForwardIterator1 keys_last,
-                  ForwardIterator2 values_first)
+__host__ __device__
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+              ForwardIterator1 keys_first,
+              ForwardIterator1 keys_last,
+              ForwardIterator2 values_first)
 {
   typedef typename thrust::iterator_traits<ForwardIterator1>::value_type KeyType;
   return thrust::unique_by_key(exec, keys_first, keys_last, values_first, thrust::equal_to<KeyType>());
@@ -55,21 +56,22 @@ template<typename ExecutionPolicy,
          typename ForwardIterator1,
          typename ForwardIterator2,
          typename BinaryPredicate>
-  thrust::pair<ForwardIterator1,ForwardIterator2>
-    unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
-                  ForwardIterator1 keys_first, 
-                  ForwardIterator1 keys_last,
-                  ForwardIterator2 values_first,
-                  BinaryPredicate binary_pred)
+__host__ __device__
+thrust::pair<ForwardIterator1,ForwardIterator2>
+unique_by_key(thrust::execution_policy<ExecutionPolicy> &exec,
+              ForwardIterator1 keys_first,
+              ForwardIterator1 keys_last,
+              ForwardIterator2 values_first,
+              BinaryPredicate binary_pred)
 {
   typedef typename thrust::iterator_traits<ForwardIterator1>::value_type InputType1;
   typedef typename thrust::iterator_traits<ForwardIterator2>::value_type InputType2;
-  
+
   ForwardIterator2 values_last = values_first + (keys_last - keys_first);
-  
+
   thrust::detail::temporary_array<InputType1,ExecutionPolicy> keys(exec, keys_first, keys_last);
   thrust::detail::temporary_array<InputType2,ExecutionPolicy> vals(exec, values_first, values_last);
-  
+
   return thrust::unique_by_key_copy(exec, keys.begin(), keys.end(), vals.begin(), keys_first, values_first, binary_pred);
 } // end unique_by_key()
 
@@ -79,13 +81,14 @@ template<typename ExecutionPolicy,
          typename InputIterator2,
          typename OutputIterator1,
          typename OutputIterator2>
-  thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
-                       InputIterator1 keys_first, 
-                       InputIterator1 keys_last,
-                       InputIterator2 values_first,
-                       OutputIterator1 keys_output,
-                       OutputIterator2 values_output)
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator1 keys_first,
+                   InputIterator1 keys_last,
+                   InputIterator2 values_first,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output)
 {
   typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
   return thrust::unique_by_key_copy(exec, keys_first, keys_last, values_first, keys_output, values_output, thrust::equal_to<KeyType>());
@@ -98,39 +101,33 @@ template<typename ExecutionPolicy,
          typename OutputIterator1,
          typename OutputIterator2,
          typename BinaryPredicate>
-  thrust::pair<OutputIterator1,OutputIterator2>
-    unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
-                       InputIterator1 keys_first, 
-                       InputIterator1 keys_last,
-                       InputIterator2 values_first,
-                       OutputIterator1 keys_output,
-                       OutputIterator2 values_output,
-                       BinaryPredicate binary_pred)
+__host__ __device__
+thrust::pair<OutputIterator1,OutputIterator2>
+unique_by_key_copy(thrust::execution_policy<ExecutionPolicy> &exec,
+                   InputIterator1 keys_first,
+                   InputIterator1 keys_last,
+                   InputIterator2 values_first,
+                   OutputIterator1 keys_output,
+                   OutputIterator2 values_output,
+                   BinaryPredicate binary_pred)
 {
   typedef typename thrust::iterator_traits<InputIterator1>::difference_type difference_type;
-  
-  // empty sequence
-  if(keys_first == keys_last)
-    return thrust::make_pair(keys_output, values_output);
-  
+
   difference_type n = thrust::distance(keys_first, keys_last);
-  
-  thrust::detail::temporary_array<int,ExecutionPolicy> stencil(exec,n);
-  
-  // mark first element in each group
-  stencil[0] = 1; 
-  thrust::transform(exec, keys_first, keys_last - 1, keys_first + 1, stencil.begin() + 1, thrust::detail::not2(binary_pred)); 
-  
+
+  thrust::detail::head_flags<InputIterator1, BinaryPredicate> stencil(keys_first, keys_last, binary_pred);
+
+  using namespace thrust::placeholders;
   thrust::zip_iterator< thrust::tuple<OutputIterator1, OutputIterator2> > result =
     thrust::copy_if(exec,
                     thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)),
                     thrust::make_zip_iterator(thrust::make_tuple(keys_first, values_first)) + n,
                     stencil.begin(),
                     thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output)),
-                    thrust::identity<int>());
-  
+                    _1);
+
   difference_type output_size = result - thrust::make_zip_iterator(thrust::make_tuple(keys_output, values_output));
-                                  
+
   return thrust::make_pair(keys_output + output_size, values_output + output_size);
 } // end unique_by_key_copy()
 
@@ -138,5 +135,5 @@ template<typename ExecutionPolicy,
 } // end namespace generic
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/internal/decompose.h b/thrust/system/detail/internal/decompose.h
index e949f2024..58af7c551 100644
--- a/thrust/system/detail/internal/decompose.h
+++ b/thrust/system/detail/internal/decompose.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -110,5 +109,5 @@ namespace internal
 } // end namespace internal
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/adjacent_difference.h b/thrust/system/detail/sequential/adjacent_difference.h
index c6b0ee1b2..4a9dad82c 100644
--- a/thrust/system/detail/sequential/adjacent_difference.h
+++ b/thrust/system/detail/sequential/adjacent_difference.h
@@ -25,8 +25,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -70,5 +69,5 @@ OutputIterator adjacent_difference(sequential::execution_policy<DerivedPolicy> &
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/assign_value.h b/thrust/system/detail/sequential/assign_value.h
index 699bcbcd7..0eb145d13 100644
--- a/thrust/system/detail/sequential/assign_value.h
+++ b/thrust/system/detail/sequential/assign_value.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/sequential/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -39,5 +38,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/binary_search.h b/thrust/system/detail/sequential/binary_search.h
index 54534143e..2da5080f4 100644
--- a/thrust/system/detail/sequential/binary_search.h
+++ b/thrust/system/detail/sequential/binary_search.h
@@ -21,13 +21,14 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/advance.h>
 #include <thrust/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -153,5 +154,5 @@ bool binary_search(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy.h b/thrust/system/detail/sequential/copy.h
index 80853f670..0dd2cdad5 100644
--- a/thrust/system/detail/sequential/copy.h
+++ b/thrust/system/detail/sequential/copy.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -57,7 +56,7 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/copy.inl>
 
diff --git a/thrust/system/detail/sequential/copy.inl b/thrust/system/detail/sequential/copy.inl
index 955986d63..850f20f1e 100644
--- a/thrust/system/detail/sequential/copy.inl
+++ b/thrust/system/detail/sequential/copy.inl
@@ -14,17 +14,18 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/copy.h>
-#include <thrust/detail/dispatch/is_trivial_copy.h>
 #include <thrust/detail/type_traits.h>
 #include <thrust/system/detail/sequential/general_copy.h>
 #include <thrust/system/detail/sequential/trivial_copy.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/type_traits/pointer_traits.h>
+#include <thrust/type_traits/is_trivially_relocatable.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,7 +53,7 @@ __host__ __device__
   OutputIterator copy(InputIterator first,
                       InputIterator last,
                       OutputIterator result,
-                      thrust::detail::true_type)  // is_trivial_copy
+                      thrust::detail::true_type)  // is_indirectly_trivially_relocatable_to
 {
   typedef typename thrust::iterator_difference<InputIterator>::type Size;
 
@@ -69,7 +70,7 @@ __host__ __device__
   OutputIterator copy(InputIterator first,
                       InputIterator last,
                       OutputIterator result,
-                      thrust::detail::false_type)  // is_trivial_copy
+                      thrust::detail::false_type)  // is_indirectly_trivially_relocatable_to
 {
   return thrust::system::detail::sequential::general_copy(first,last,result);
 } // end copy()
@@ -83,13 +84,14 @@ __host__ __device__
   OutputIterator copy_n(InputIterator first,
                         Size n,
                         OutputIterator result,
-                        thrust::detail::true_type)  // is_trivial_copy
+                        thrust::detail::true_type)  // is_indirectly_trivially_relocatable_to
 {
   thrust::system::detail::sequential::trivial_copy_n(get(&*first), n, get(&*result));
   return result + n;
 } // end copy_n()
 
 
+__thrust_exec_check_disable__
 template<typename InputIterator,
          typename Size,
          typename OutputIterator>
@@ -97,7 +99,7 @@ __host__ __device__
   OutputIterator copy_n(InputIterator first,
                         Size n,
                         OutputIterator result,
-                        thrust::detail::false_type)  // is_trivial_copy
+                        thrust::detail::false_type)  // is_indirectly_trivially_relocatable_to
 {
   return thrust::system::detail::sequential::general_copy_n(first,n,result);
 } // end copy_n()
@@ -117,10 +119,11 @@ __host__ __device__
                       OutputIterator result)
 {
   return thrust::system::detail::sequential::copy_detail::copy(first, last, result,
-    typename thrust::detail::dispatch::is_trivial_copy<InputIterator,OutputIterator>::type());
+    typename thrust::is_indirectly_trivially_relocatable_to<InputIterator,OutputIterator>::type());
 } // end copy()
 
 
+__thrust_exec_check_disable__
 template<typename DerivedPolicy,
          typename InputIterator,
          typename Size,
@@ -132,12 +135,12 @@ __host__ __device__
                         OutputIterator result)
 {
   return thrust::system::detail::sequential::copy_detail::copy_n(first, n, result,
-    typename thrust::detail::dispatch::is_trivial_copy<InputIterator,OutputIterator>::type());
+    typename thrust::is_indirectly_trivially_relocatable_to<InputIterator,OutputIterator>::type());
 } // end copy_n()
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy_backward.h b/thrust/system/detail/sequential/copy_backward.h
index e825436b1..d127ac80d 100644
--- a/thrust/system/detail/sequential/copy_backward.h
+++ b/thrust/system/detail/sequential/copy_backward.h
@@ -18,8 +18,7 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,5 +49,5 @@ BidirectionalIterator2 copy_backward(BidirectionalIterator1 first,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/copy_if.h b/thrust/system/detail/sequential/copy_if.h
index bb29ccdeb..3c00956de 100644
--- a/thrust/system/detail/sequential/copy_if.h
+++ b/thrust/system/detail/sequential/copy_if.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,5 +68,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/execution_policy.h b/thrust/system/detail/sequential/execution_policy.h
index 7b5f69666..99d78fc27 100644
--- a/thrust/system/detail/sequential/execution_policy.h
+++ b/thrust/system/detail/sequential/execution_policy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ template<>
 // tag's definition comes before the generic definition of execution_policy
 struct tag : execution_policy<tag>
 {
-  __host__ __device__ tag() {}
+  __host__ __device__ constexpr tag() {}
 };
 
 // allow conversion to tag when it is not a successor
@@ -66,15 +65,11 @@ template<typename Derived>
 };
 
 
-#ifdef __CUDA_ARCH__
-static const __device__ tag seq;
-#else
-static const tag seq;
-#endif
+THRUST_INLINE_CONSTANT tag seq;
 
 
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/extrema.h b/thrust/system/detail/sequential/extrema.h
index 7bfa5a17d..5e5c62da6 100644
--- a/thrust/system/detail/sequential/extrema.h
+++ b/thrust/system/detail/sequential/extrema.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -135,5 +134,5 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(sequential::executi
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/find.h b/thrust/system/detail/sequential/find.h
index 5e551b74a..54c238c71 100644
--- a/thrust/system/detail/sequential/find.h
+++ b/thrust/system/detail/sequential/find.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -67,5 +66,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/for_each.h b/thrust/system/detail/sequential/for_each.h
index 6e83d18c1..7058c56f2 100644
--- a/thrust/system/detail/sequential/for_each.h
+++ b/thrust/system/detail/sequential/for_each.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -91,5 +90,5 @@ InputIterator for_each_n(sequential::execution_policy<DerivedPolicy> &,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/general_copy.h b/thrust/system/detail/sequential/general_copy.h
index 9546b72e5..6ea87bbac 100644
--- a/thrust/system/detail/sequential/general_copy.h
+++ b/thrust/system/detail/sequential/general_copy.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/raw_reference_cast.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -143,5 +142,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/get_value.h b/thrust/system/detail/sequential/get_value.h
index 5f3f8eb04..90752d867 100644
--- a/thrust/system/detail/sequential/get_value.h
+++ b/thrust/system/detail/sequential/get_value.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/sequential/execution_policy.h>
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,5 +41,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/insertion_sort.h b/thrust/system/detail/sequential/insertion_sort.h
index f0bb9bc5f..9acccd8e9 100644
--- a/thrust/system/detail/sequential/insertion_sort.h
+++ b/thrust/system/detail/sequential/insertion_sort.h
@@ -22,8 +22,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/copy_backward.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -149,5 +148,5 @@ void insertion_sort_by_key(RandomAccessIterator1 first1,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/iter_swap.h b/thrust/system/detail/sequential/iter_swap.h
index f777f63a3..7a5c481fc 100644
--- a/thrust/system/detail/sequential/iter_swap.h
+++ b/thrust/system/detail/sequential/iter_swap.h
@@ -21,8 +21,7 @@
 #include <thrust/detail/raw_pointer_cast.h>
 #include <thrust/detail/swap.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -31,9 +30,9 @@ namespace sequential
 {
 
 
-template<typename Pointer1, typename Pointer2>
+template<typename DerivedPolicy, typename Pointer1, typename Pointer2>
 __host__ __device__
-  void iter_swap(tag, Pointer1 a, Pointer2 b)
+  void iter_swap(sequential::execution_policy<DerivedPolicy> &, Pointer1 a, Pointer2 b)
 {
   using thrust::swap;
   swap(*thrust::raw_pointer_cast(a), *thrust::raw_pointer_cast(b));
@@ -43,5 +42,5 @@ __host__ __device__
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/malloc_and_free.h b/thrust/system/detail/sequential/malloc_and_free.h
index a54ddf0a9..b250140e0 100644
--- a/thrust/system/detail/sequential/malloc_and_free.h
+++ b/thrust/system/detail/sequential/malloc_and_free.h
@@ -21,8 +21,7 @@
 #include <cstdlib> // for malloc & free
 #include <thrust/detail/raw_pointer_cast.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -35,11 +34,7 @@ template<typename DerivedPolicy>
 inline __host__ __device__
 void *malloc(execution_policy<DerivedPolicy> &, std::size_t n)
 {
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200)
   return std::malloc(n);
-#else
-  return 0;
-#endif
 } // end mallc()
 
 
@@ -47,14 +42,12 @@ template<typename DerivedPolicy, typename Pointer>
 inline __host__ __device__
 void free(sequential::execution_policy<DerivedPolicy> &, Pointer ptr)
 {
-#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 200)
   std::free(thrust::raw_pointer_cast(ptr));
-#endif
 } // end mallc()
 
 
 } // end sequential
 } // end detail
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/merge.h b/thrust/system/detail/sequential/merge.h
index 6cd314dc7..a45e18004 100644
--- a/thrust/system/detail/sequential/merge.h
+++ b/thrust/system/detail/sequential/merge.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -74,7 +73,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/merge.inl>
 
diff --git a/thrust/system/detail/sequential/merge.inl b/thrust/system/detail/sequential/merge.inl
index b3a7e8a81..08d7c0b0d 100644
--- a/thrust/system/detail/sequential/merge.inl
+++ b/thrust/system/detail/sequential/merge.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/merge.h>
 #include <thrust/detail/copy.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -82,7 +83,7 @@ template<typename DerivedPolicy,
          typename StrictWeakOrdering>
 __host__ __device__
 thrust::pair<OutputIterator1,OutputIterator2>
-  merge_by_key(sequential::execution_policy<DerivedPolicy> &exec,
+  merge_by_key(sequential::execution_policy<DerivedPolicy> &,
                InputIterator1 keys_first1,
                InputIterator1 keys_last1,
                InputIterator2 keys_first2,
@@ -149,5 +150,5 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/partition.h b/thrust/system/detail/sequential/partition.h
index 66996d637..43d5b0e23 100644
--- a/thrust/system/detail/sequential/partition.h
+++ b/thrust/system/detail/sequential/partition.h
@@ -27,8 +27,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace detail
 {
 
@@ -95,7 +94,8 @@ __host__ __device__
   {
     if(wrapped_pred(*next))
     {
-      iter_swap(first, next);
+      // Fully qualify name to disambiguate overloads found via ADL.
+      THRUST_NS_QUALIFIER::system::detail::sequential::iter_swap(first, next);
       ++first;
     }
   }
@@ -143,7 +143,8 @@ __host__ __device__
   {
     if(wrapped_pred(*stencil_first))
     {
-      iter_swap(first, next);
+      // Fully qualify name to disambiguate overloads found via ADL.
+      THRUST_NS_QUALIFIER::system::detail::sequential::iter_swap(first, next);
       ++first;
     }
 
@@ -335,5 +336,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/per_device_resource.h b/thrust/system/detail/sequential/per_device_resource.h
new file mode 100644
index 000000000..1b8d61f92
--- /dev/null
+++ b/thrust/system/detail/sequential/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/system/detail/sequential/reduce.h b/thrust/system/detail/sequential/reduce.h
index 55e92acb9..a532f71b2 100644
--- a/thrust/system/detail/sequential/reduce.h
+++ b/thrust/system/detail/sequential/reduce.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -69,5 +68,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/reduce_by_key.h b/thrust/system/detail/sequential/reduce_by_key.h
index f19e62a29..ef17ac5b0 100644
--- a/thrust/system/detail/sequential/reduce_by_key.h
+++ b/thrust/system/detail/sequential/reduce_by_key.h
@@ -19,11 +19,9 @@
 #include <thrust/detail/config.h>
 #include <thrust/pair.h>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -54,11 +52,8 @@ __host__ __device__
   typedef typename thrust::iterator_traits<InputIterator1>::value_type  InputKeyType;
   typedef typename thrust::iterator_traits<InputIterator2>::value_type  InputValueType;
 
-  typedef typename thrust::detail::intermediate_type_from_function_and_iterators<
-    InputIterator2,
-    OutputIterator2,
-    BinaryFunction
-  >::type TemporaryType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using TemporaryType = typename thrust::iterator_value<InputIterator2>::type;
 
   if(keys_first != keys_last)
   {
@@ -103,5 +98,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/remove.h b/thrust/system/detail/sequential/remove.h
index 48de522df..df564f15b 100644
--- a/thrust/system/detail/sequential/remove.h
+++ b/thrust/system/detail/sequential/remove.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -198,5 +197,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/scan.h b/thrust/system/detail/sequential/scan.h
index dce18c6b6..c5fce2475 100644
--- a/thrust/system/detail/sequential/scan.h
+++ b/thrust/system/detail/sequential/scan.h
@@ -29,8 +29,7 @@
 #include <thrust/detail/type_traits/iterator/is_output_iterator.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,29 +50,10 @@ __host__ __device__
                                 OutputIterator result,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<BinaryFunction>::type
-  
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
 
   // wrap binary_op
   thrust::detail::wrapped_function<
@@ -85,7 +65,7 @@ __host__ __device__
   {
     ValueType sum = *first;
 
-    *result = sum;
+    *result = *first;
 
     for(++first, ++result; first != last; ++first, ++result)
       *result = sum = wrapped_binary_op(sum,*first);
@@ -99,39 +79,20 @@ __thrust_exec_check_disable__
 template<typename DerivedPolicy,
          typename InputIterator,
          typename OutputIterator,
-         typename T,
+         typename InitialValueType,
          typename BinaryFunction>
 __host__ __device__
   OutputIterator exclusive_scan(sequential::execution_policy<DerivedPolicy> &,
                                 InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
-                                T init,
+                                InitialValueType init,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<BinaryFunction>::type
-
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
 
   if(first != last)
   {
@@ -156,5 +117,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/scan_by_key.h b/thrust/system/detail/sequential/scan_by_key.h
index 1e0471b37..c428c1050 100644
--- a/thrust/system/detail/sequential/scan_by_key.h
+++ b/thrust/system/detail/sequential/scan_by_key.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -52,8 +51,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        BinaryFunction binary_op)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+  using KeyType = typename thrust::iterator_traits<InputIterator1>::value_type;
+  using ValueType = typename thrust::iterator_traits<InputIterator2>::value_type;
 
   // wrap binary_op
   thrust::detail::wrapped_function<
@@ -105,8 +104,8 @@ __host__ __device__
                                        BinaryPredicate binary_pred,
                                        BinaryFunction binary_op)
 {
-  typedef typename thrust::iterator_traits<InputIterator1>::value_type KeyType;
-  typedef typename thrust::iterator_traits<OutputIterator>::value_type ValueType;
+  using KeyType = typename thrust::iterator_traits<InputIterator1>::value_type;
+  using ValueType = T;
 
   if(first1 != last1)
   {
@@ -146,5 +145,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/set_operations.h b/thrust/system/detail/sequential/set_operations.h
index a9b1cc688..678754b45 100644
--- a/thrust/system/detail/sequential/set_operations.h
+++ b/thrust/system/detail/sequential/set_operations.h
@@ -26,8 +26,7 @@
 #include <thrust/detail/copy.h>
 #include <thrust/detail/function.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -220,5 +219,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/sort.h b/thrust/system/detail/sequential/sort.h
index 0900743d8..34cc7a8ba 100644
--- a/thrust/system/detail/sequential/sort.h
+++ b/thrust/system/detail/sequential/sort.h
@@ -23,8 +23,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -58,7 +57,7 @@ void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/sort.inl>
 
diff --git a/thrust/system/detail/sequential/sort.inl b/thrust/system/detail/sequential/sort.inl
index 3d8b6e773..241a860af 100644
--- a/thrust/system/detail/sequential/sort.inl
+++ b/thrust/system/detail/sequential/sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/reverse.h>
 #include <thrust/detail/type_traits.h>
@@ -21,8 +24,9 @@
 #include <thrust/system/detail/sequential/stable_merge_sort.h>
 #include <thrust/system/detail/sequential/stable_primitive_sort.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -54,11 +58,11 @@ __host__ __device__
 void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
                  RandomAccessIterator first,
                  RandomAccessIterator last,
-                 StrictWeakOrdering comp,
+                 StrictWeakOrdering,
                  thrust::detail::true_type)
 {
   thrust::system::detail::sequential::stable_primitive_sort(exec, first, last);
-        
+
   // if comp is greater<T> then reverse the keys
   typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
 
@@ -78,7 +82,7 @@ void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
                         RandomAccessIterator1 first1,
                         RandomAccessIterator1 last1,
                         RandomAccessIterator2 first2,
-                        StrictWeakOrdering comp,
+                        StrictWeakOrdering,
                         thrust::detail::true_type)
 {
   // if comp is greater<T> then reverse the keys and values
@@ -160,16 +164,16 @@ void stable_sort(sequential::execution_policy<DerivedPolicy> &exec,
                  RandomAccessIterator last,
                  StrictWeakOrdering comp)
 {
-  typedef typename thrust::iterator_traits<RandomAccessIterator>::value_type KeyType;
 
   // the compilation time of stable_primitive_sort is too expensive to use within a single CUDA thread
-#ifndef __CUDA_ARCH__
-  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
-#else
-  thrust::detail::false_type use_primitive_sort;
-#endif
-
-  sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  NV_IF_TARGET(NV_IS_HOST, (
+    using KeyType = thrust::iterator_value_t<RandomAccessIterator>;
+    sort_detail::use_primitive_sort<KeyType, StrictWeakOrdering> use_primitive_sort;
+    sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  ), ( // NV_IS_DEVICE:
+    thrust::detail::false_type use_primitive_sort;
+    sort_detail::stable_sort(exec, first, last, comp, use_primitive_sort);
+  ));
 }
 
 
@@ -184,21 +188,21 @@ void stable_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
                         RandomAccessIterator2 first2,
                         StrictWeakOrdering comp)
 {
-  typedef typename thrust::iterator_traits<RandomAccessIterator1>::value_type KeyType;
 
   // the compilation time of stable_primitive_sort_by_key is too expensive to use within a single CUDA thread
-#ifndef __CUDA_ARCH__
-  sort_detail::use_primitive_sort<KeyType,StrictWeakOrdering> use_primitive_sort;
-#else
-  thrust::detail::false_type use_primitive_sort;
-#endif
-
-  sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  NV_IF_TARGET(NV_IS_HOST, (
+    using KeyType = thrust::iterator_value_t<RandomAccessIterator1>;
+    sort_detail::use_primitive_sort<KeyType, StrictWeakOrdering> use_primitive_sort;
+    sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  ), ( // NV_IS_DEVICE:
+    thrust::detail::false_type use_primitive_sort;
+    sort_detail::stable_sort_by_key(exec, first1, last1, first2, comp, use_primitive_sort);
+  ));
 }
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_merge_sort.h b/thrust/system/detail/sequential/stable_merge_sort.h
index 359ba8d7b..64aa2bf96 100644
--- a/thrust/system/detail/sequential/stable_merge_sort.h
+++ b/thrust/system/detail/sequential/stable_merge_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -54,7 +53,7 @@ void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_merge_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_merge_sort.inl b/thrust/system/detail/sequential/stable_merge_sort.inl
index 8ba3bf908..02f384afb 100644
--- a/thrust/system/detail/sequential/stable_merge_sort.inl
+++ b/thrust/system/detail/sequential/stable_merge_sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
 
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/temporary_array.h>
@@ -21,8 +24,9 @@
 #include <thrust/system/detail/sequential/insertion_sort.h>
 #include <thrust/detail/minmax.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -97,7 +101,7 @@ void insertion_sort_each(RandomAccessIterator first,
   {
     for(; first < last; first += partition_size)
     {
-      RandomAccessIterator partition_last = thrust::min(last, first + partition_size);
+      RandomAccessIterator partition_last = (thrust::min)(last, first + partition_size);
 
       thrust::system::detail::sequential::insertion_sort(first, partition_last, comp);
     } // end for
@@ -120,7 +124,7 @@ void insertion_sort_each_by_key(RandomAccessIterator1 keys_first,
   {
     for(; keys_first < keys_last; keys_first += partition_size, values_first += partition_size)
     {
-      RandomAccessIterator1 keys_partition_last = thrust::min(keys_last, keys_first + partition_size);
+      RandomAccessIterator1 keys_partition_last = (thrust::min)(keys_last, keys_first + partition_size);
 
       thrust::system::detail::sequential::insertion_sort_by_key(keys_first, keys_partition_last, values_first, comp);
     } // end for
@@ -143,8 +147,8 @@ void merge_adjacent_partitions(sequential::execution_policy<DerivedPolicy> &exec
 {
   for(; first < last; first += 2 * partition_size, result += 2 * partition_size)
   {
-    RandomAccessIterator1 interval_middle = thrust::min(last, first + partition_size);
-    RandomAccessIterator1 interval_last   = thrust::min(last, interval_middle + partition_size);
+    RandomAccessIterator1 interval_middle = (thrust::min)(last, first + partition_size);
+    RandomAccessIterator1 interval_last   = (thrust::min)(last, interval_middle + partition_size);
 
     thrust::merge(exec,
                   first, interval_middle,
@@ -178,8 +182,8 @@ void merge_adjacent_partitions_by_key(sequential::execution_policy<DerivedPolicy
       keys_first < keys_last;
       keys_first += stride, values_first += stride, keys_result += stride, values_result += stride)
   {
-    RandomAccessIterator1 keys_interval_middle = thrust::min(keys_last, keys_first + partition_size);
-    RandomAccessIterator1 keys_interval_last   = thrust::min(keys_last, keys_interval_middle + partition_size);
+    RandomAccessIterator1 keys_interval_middle = (thrust::min)(keys_last, keys_first + partition_size);
+    RandomAccessIterator1 keys_interval_last   = (thrust::min)(keys_last, keys_interval_middle + partition_size);
 
     RandomAccessIterator2 values_first2 = values_first + (keys_interval_middle - keys_first);
 
@@ -353,12 +357,12 @@ void stable_merge_sort(sequential::execution_policy<DerivedPolicy> &exec,
                        RandomAccessIterator last,
                        StrictWeakOrdering comp)
 {
-  // avoid recursion in CUDA threads
-#ifdef __CUDA_ARCH__
-  stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp);
-#else
-  stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp);
-#endif
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // avoid recursion in CUDA threads
+    stable_merge_sort_detail::iterative_stable_merge_sort(exec, first, last, comp);
+  ), (
+    stable_merge_sort_detail::recursive_stable_merge_sort(exec, first, last, comp);
+  ));
 }
 
 
@@ -373,17 +377,17 @@ void stable_merge_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
                               RandomAccessIterator2 first2,
                               StrictWeakOrdering comp)
 {
-  // avoid recursion in CUDA threads
-#ifdef __CUDA_ARCH__
-  stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
-#else
-  stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
-#endif
+  NV_IF_TARGET(NV_IS_DEVICE, (
+    // avoid recursion in CUDA threads
+    stable_merge_sort_detail::iterative_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+  ), (
+    stable_merge_sort_detail::recursive_stable_merge_sort_by_key(exec, first1, last1, first2, comp);
+  ));
 }
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_primitive_sort.h b/thrust/system/detail/sequential/stable_primitive_sort.h
index 3426f953a..acbb81217 100644
--- a/thrust/system/detail/sequential/stable_primitive_sort.h
+++ b/thrust/system/detail/sequential/stable_primitive_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &e
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_primitive_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_primitive_sort.inl b/thrust/system/detail/sequential/stable_primitive_sort.inl
index e5cea4ad3..9897d6798 100644
--- a/thrust/system/detail/sequential/stable_primitive_sort.inl
+++ b/thrust/system/detail/sequential/stable_primitive_sort.inl
@@ -24,8 +24,7 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -157,5 +156,5 @@ void stable_primitive_sort_by_key(sequential::execution_policy<DerivedPolicy> &e
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/stable_radix_sort.h b/thrust/system/detail/sequential/stable_radix_sort.h
index 9f7482ccf..1e9713a2c 100644
--- a/thrust/system/detail/sequential/stable_radix_sort.h
+++ b/thrust/system/detail/sequential/stable_radix_sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -50,7 +49,7 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/sequential/stable_radix_sort.inl>
 
diff --git a/thrust/system/detail/sequential/stable_radix_sort.inl b/thrust/system/detail/sequential/stable_radix_sort.inl
index 6e2132694..83d95ebfd 100644
--- a/thrust/system/detail/sequential/stable_radix_sort.inl
+++ b/thrust/system/detail/sequential/stable_radix_sort.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,8 +14,9 @@
  *  limitations under the License.
  */
 
+#pragma once
 
-#include <limits>
+#include <thrust/detail/config.h>
 
 #include <thrust/copy.h>
 #include <thrust/functional.h>
@@ -26,8 +27,9 @@
 #include <thrust/detail/cstdint.h>
 #include <thrust/scatter.h>
 
-namespace thrust
-{
+#include <limits>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -51,7 +53,7 @@ struct RadixEncoder<char> : public thrust::unary_function<char, unsigned char>
   {
     if(std::numeric_limits<char>::is_signed)
     {
-      return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+      return static_cast<unsigned char>(x) ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
     }
     else
     {
@@ -66,7 +68,7 @@ struct RadixEncoder<signed char> : public thrust::unary_function<signed char, un
   __host__ __device__
   unsigned char operator()(signed char x) const
   {
-    return x ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
+    return static_cast<unsigned char>(x) ^ static_cast<unsigned char>(1) << (8 * sizeof(unsigned char) - 1);
   }
 };
 
@@ -76,7 +78,7 @@ struct RadixEncoder<short> : public thrust::unary_function<short, unsigned short
   __host__ __device__
   unsigned short operator()(short x) const
   {
-    return x ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
+    return static_cast<unsigned short>(x) ^ static_cast<unsigned short>(1) << (8 * sizeof(unsigned short) - 1);
   }
 };
 
@@ -242,9 +244,9 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
   const unsigned int NumHistograms = (8 * sizeof(EncodedType) + (RadixBits - 1)) / RadixBits;
   const unsigned int HistogramSize =  1 << RadixBits;
-  
+
   const EncodedType BitMask = static_cast<EncodedType>((1 << RadixBits) - 1);
-  
+
   Encoder encode;
 
   // storage for histograms
@@ -252,10 +254,10 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
   // see which passes can be eliminated
   bool skip_shuffle[NumHistograms] = {false};
-  
+
   // false if most recent data is stored in (keys1,vals1)
   bool flip = false;
-    
+
   // compute histograms
   for(size_t i = 0; i < N; i++)
   {
@@ -263,7 +265,7 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
 
     for(unsigned int j = 0; j < NumHistograms; j++)
     {
-      const EncodedType BitShift = RadixBits * j;
+      const auto BitShift = static_cast<EncodedType>(RadixBits * j);
       histograms[j][(x >> BitShift) & BitMask]++;
     }
   }
@@ -286,7 +288,7 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
     }
   }
 
-  // shuffle keys and (optionally) values 
+  // shuffle keys and (optionally) values
   for(unsigned int i = 0; i < NumHistograms; i++)
   {
     const EncodedType BitShift = static_cast<EncodedType>(RadixBits * i);
@@ -315,11 +317,11 @@ void radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
           radix_shuffle_n<RadixBits>(exec, keys1, N, keys2, BitShift, histograms[i]);
         }
       }
-        
+
       flip = (flip) ? false : true;
     }
   }
- 
+
   // ensure final values are in (keys1,vals1)
   if(flip)
   {
@@ -381,7 +383,13 @@ struct radix_sort_dispatcher<2>
                   RandomAccessIterator1 keys1, RandomAccessIterator2 keys2,
                   const size_t N)
   {
-    if(N < (1 << 16))
+#ifdef __QNX__
+    // XXX war for nvbug 200193674
+    const bool condition = true;
+#else
+    const bool condition = N < (1 << 16);
+#endif
+    if (condition)
     {
       radix_sort_detail::radix_sort<8,false>(exec, keys1, keys2, static_cast<int *>(0), static_cast<int *>(0), N);
     }
@@ -403,7 +411,13 @@ struct radix_sort_dispatcher<2>
                   RandomAccessIterator3 vals1, RandomAccessIterator4 vals2,
                   const size_t N)
   {
-    if(N < (1 << 15))
+#ifdef __QNX__
+    // XXX war for nvbug 200193674
+    const bool condition = true;
+#else
+    const bool condition = N < (1 << 15);
+#endif
+    if (condition)
     {
       radix_sort_detail::radix_sort<8,true>(exec, keys1, keys2, vals1, vals2, N);
     }
@@ -548,9 +562,9 @@ void stable_radix_sort(sequential::execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_value<RandomAccessIterator>::type KeyType;
 
   size_t N = last - first;
-  
+
   thrust::detail::temporary_array<KeyType, DerivedPolicy> temp(exec, N);
-  
+
   radix_sort_detail::radix_sort(exec, first, temp.begin(), N);
 }
 
@@ -568,7 +582,7 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_value<RandomAccessIterator2>::type ValueType;
 
   size_t N = last1 - first1;
-  
+
   thrust::detail::temporary_array<KeyType, DerivedPolicy>   temp1(exec, N);
   thrust::detail::temporary_array<ValueType, DerivedPolicy> temp2(exec, N);
 
@@ -579,5 +593,5 @@ void stable_radix_sort_by_key(sequential::execution_policy<DerivedPolicy> &exec,
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/trivial_copy.h b/thrust/system/detail/sequential/trivial_copy.h
index 77bf6dd42..ea55c8fd2 100644
--- a/thrust/system/detail/sequential/trivial_copy.h
+++ b/thrust/system/detail/sequential/trivial_copy.h
@@ -24,8 +24,9 @@
 #include <cstring>
 #include <thrust/system/detail/sequential/general_copy.h>
 
-namespace thrust
-{
+#include <nv/target>
+
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -40,17 +41,21 @@ __host__ __device__
                     std::ptrdiff_t n,
                     T *result)
 {
-#ifndef __CUDA_ARCH__
-  std::memmove(result, first, n * sizeof(T));
-  return result + n;
-#else
-  return thrust::system::detail::sequential::general_copy_n(first, n, result);
-#endif
+  T* return_value = NULL;
+
+  NV_IF_TARGET(NV_IS_HOST, (
+    std::memmove(result, first, n * sizeof(T));
+    return_value = result + n;
+  ), ( // NV_IS_DEVICE:
+    return_value = thrust::system::detail::sequential::general_copy_n(first, n, result);
+  ));
+
+  return return_value;
 } // end trivial_copy_n()
 
 
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/unique.h b/thrust/system/detail/sequential/unique.h
index d8b50d905..c4fe5268a 100644
--- a/thrust/system/detail/sequential/unique.h
+++ b/thrust/system/detail/sequential/unique.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -42,7 +41,7 @@ template<typename DerivedPolicy,
          typename OutputIterator,
          typename BinaryPredicate>
 __host__ __device__
-  OutputIterator unique_copy(sequential::execution_policy<DerivedPolicy> &exec,
+  OutputIterator unique_copy(sequential::execution_policy<DerivedPolicy> &,
                              InputIterator first,
                              InputIterator last,
                              OutputIterator output,
@@ -90,8 +89,42 @@ __host__ __device__
 } // end unique()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(sequential::execution_policy<DerivedPolicy> &,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  typedef typename thrust::iterator_traits<ForwardIterator>::value_type T;
+  typename thrust::iterator_traits<ForwardIterator>::difference_type count{};
+
+  if(first != last)
+  {
+    count++;
+    T prev = *first;
+
+    for(++first; first != last; ++first)
+    {
+      T temp = *first;
+
+      if (!binary_pred(prev, temp))
+      {
+        count++;
+        prev = temp;
+      }
+    }
+  }
+
+  return count;
+} // end unique()
+
+
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/sequential/unique_by_key.h b/thrust/system/detail/sequential/unique_by_key.h
index 899ce02db..d30cc7c71 100644
--- a/thrust/system/detail/sequential/unique_by_key.h
+++ b/thrust/system/detail/sequential/unique_by_key.h
@@ -26,8 +26,7 @@
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace detail
@@ -112,5 +111,5 @@ __host__ __device__
 } // end namespace sequential
 } // end namespace detail
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/detail/system_error.inl b/thrust/system/detail/system_error.inl
index 3e59458aa..075fe88e4 100644
--- a/thrust/system/detail/system_error.inl
+++ b/thrust/system/detail/system_error.inl
@@ -17,10 +17,11 @@
 
 #pragma once
 
+#include <thrust/detail/config.h>
+
 #include <thrust/system/system_error.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -75,14 +76,14 @@ system_error
 
 
 const error_code &system_error
-  ::code(void) const throw()
+  ::code(void) const noexcept
 {
   return m_error_code;
 } // end system_error::code()
 
 
 const char *system_error
-  ::what(void) const throw()
+  ::what(void) const noexcept
 {
   if(m_what.empty())
   {
@@ -107,5 +108,5 @@ const char *system_error
 
 } // end system
 
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/error_code.h b/thrust/system/error_code.h
index f6222277b..d460a315b 100644
--- a/thrust/system/error_code.h
+++ b/thrust/system/error_code.h
@@ -27,8 +27,7 @@
 #include <thrust/system/detail/errno.h>
 #include <iostream>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -54,6 +53,8 @@ template<typename T> struct is_error_condition_enum : public thrust::detail::fal
 namespace errc
 {
 
+/*! An enum containing common error codes.
+ */
 enum errc_t
 {
   address_family_not_supported       = detail::eafnosupport,
@@ -513,7 +514,7 @@ namespace errc = system::errc;
 using system::generic_category;
 using system::system_category;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/error_category.inl>
 #include <thrust/system/detail/error_code.inl>
diff --git a/thrust/system/omp/detail/adjacent_difference.h b/thrust/system/omp/detail/adjacent_difference.h
index 7f314eaeb..622ee61ba 100644
--- a/thrust/system/omp/detail/adjacent_difference.h
+++ b/thrust/system/omp/detail/adjacent_difference.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -46,5 +45,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/binary_search.h b/thrust/system/omp/detail/binary_search.h
index 37ff8fab5..1ed700bd8 100644
--- a/thrust/system/omp/detail/binary_search.h
+++ b/thrust/system/omp/detail/binary_search.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/binary_search.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -69,5 +68,5 @@ bool binary_search(execution_policy<DerivedPolicy> &exec,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/copy.h b/thrust/system/omp/detail/copy.h
index e2b6661e8..ae7b1eed7 100644
--- a/thrust/system/omp/detail/copy.h
+++ b/thrust/system/omp/detail/copy.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -51,7 +50,7 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/copy.inl>
 
diff --git a/thrust/system/omp/detail/copy.inl b/thrust/system/omp/detail/copy.inl
index 4d104e5ec..47f606dda 100644
--- a/thrust/system/omp/detail/copy.inl
+++ b/thrust/system/omp/detail/copy.inl
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits/minimum_type.h>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -133,5 +132,5 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/copy_if.h b/thrust/system/omp/detail/copy_if.h
index a5c28704d..b33fd96df 100644
--- a/thrust/system/omp/detail/copy_if.h
+++ b/thrust/system/omp/detail/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -45,7 +44,7 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/copy_if.inl>
 
diff --git a/thrust/system/omp/detail/copy_if.inl b/thrust/system/omp/detail/copy_if.inl
index 7f2516a74..8e597d4fc 100644
--- a/thrust/system/omp/detail/copy_if.inl
+++ b/thrust/system/omp/detail/copy_if.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/copy_if.h>
 #include <thrust/system/detail/generic/copy_if.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -50,5 +49,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/default_decomposition.h b/thrust/system/omp/detail/default_decomposition.h
index cb4b03c71..2fe0a24fd 100644
--- a/thrust/system/omp/detail/default_decomposition.h
+++ b/thrust/system/omp/detail/default_decomposition.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/detail/internal/decompose.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -39,7 +38,7 @@ thrust::system::detail::internal::uniform_decomposition<IndexType> default_decom
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/default_decomposition.inl>
 
diff --git a/thrust/system/omp/detail/default_decomposition.inl b/thrust/system/omp/detail/default_decomposition.inl
index 75b690ebb..0698d53fb 100644
--- a/thrust/system/omp/detail/default_decomposition.inl
+++ b/thrust/system/omp/detail/default_decomposition.inl
@@ -14,6 +14,8 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/default_decomposition.h>
 
@@ -22,8 +24,7 @@
 #include <omp.h>
 #endif // omp support
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -39,8 +40,12 @@ thrust::system::detail::internal::uniform_decomposition<IndexType> default_decom
   // X Note to the user: If you've found this line due to a compiler error, X
   // X you need to OpenMP support in your compiler.                         X
   // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<IndexType,
-                        (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      IndexType, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
 
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   return thrust::system::detail::internal::uniform_decomposition<IndexType>(n, 1, omp_get_num_procs());
@@ -52,5 +57,5 @@ thrust::system::detail::internal::uniform_decomposition<IndexType> default_decom
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/execution_policy.h b/thrust/system/omp/detail/execution_policy.h
index 1696e3e0b..f9b45312b 100644
--- a/thrust/system/omp/detail/execution_policy.h
+++ b/thrust/system/omp/detail/execution_policy.h
@@ -22,8 +22,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -59,11 +58,8 @@ template<typename Derived>
   struct execution_policy
     : thrust::system::cpp::detail::execution_policy<Derived>
 {
-  // allow conversion to tag
-  inline operator tag () const
-  {
-    return tag();
-  }
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
 };
 
 
@@ -106,5 +102,5 @@ using thrust::system::omp::execution_policy;
 using thrust::system::omp::tag;
 
 } // end omp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/extrema.h b/thrust/system/omp/detail/extrema.h
index 96661180d..bde4e5f80 100644
--- a/thrust/system/omp/detail/extrema.h
+++ b/thrust/system/omp/detail/extrema.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/system/detail/generic/extrema.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -62,6 +61,6 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<De
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/omp/detail/find.h b/thrust/system/omp/detail/find.h
index e6445c068..d2abac95e 100644
--- a/thrust/system/omp/detail/find.h
+++ b/thrust/system/omp/detail/find.h
@@ -25,8 +25,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -47,5 +46,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/for_each.h b/thrust/system/omp/detail/for_each.h
index 4e6955ea2..a2030f374 100644
--- a/thrust/system/omp/detail/for_each.h
+++ b/thrust/system/omp/detail/for_each.h
@@ -25,8 +25,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -54,7 +53,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/for_each.inl>
 
diff --git a/thrust/system/omp/detail/for_each.inl b/thrust/system/omp/detail/for_each.inl
index 435137a48..4246d5380 100644
--- a/thrust/system/omp/detail/for_each.inl
+++ b/thrust/system/omp/detail/for_each.inl
@@ -14,21 +14,17 @@
  *  limitations under the License.
  */
 
-
-/*! \file for_each.inl
- *  \brief Inline file for for_each.h.
- */
+#pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/detail/static_assert.h>
-#include <thrust/distance.h>
 #include <thrust/detail/function.h>
-#include <thrust/iterator/iterator_traits.h>
+#include <thrust/detail/static_assert.h>
 #include <thrust/distance.h>
 #include <thrust/for_each.h>
+#include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/omp/detail/pragma_omp.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -50,22 +46,23 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
   // X Note to the user: If you've found this line due to a compiler error, X
   // X you need to enable OpenMP support in your compiler.                  X
   // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator,
-                        (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
 
   if (n <= 0) return first;  //empty range
 
   // create a wrapped function for f
   thrust::detail::wrapped_function<UnaryFunction,void> wrapped_f(f);
 
-// do not attempt to compile the body of this function, which depends on #pragma omp,
-// without support from the compiler
-// XXX implement the body of this function in another file to eliminate this ugliness
-#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   // use a signed type for the iteration variable or suffer the consequences of warnings
   typedef typename thrust::iterator_difference<RandomAccessIterator>::type DifferenceType;
   DifferenceType signed_n = n;
-#pragma omp parallel for
+
+  THRUST_PRAGMA_OMP(parallel for)
   for(DifferenceType i = 0;
       i < signed_n;
       ++i)
@@ -73,10 +70,9 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
     RandomAccessIterator temp = first + i;
     wrapped_f(*temp);
   }
-#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
 
   return first + n;
-} // end for_each_n() 
+} // end for_each_n()
 
 template<typename DerivedPolicy,
          typename RandomAccessIterator,
@@ -92,5 +88,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/memory.inl b/thrust/system/omp/detail/memory.inl
index 00225addb..db9b4f07b 100644
--- a/thrust/system/omp/detail/memory.inl
+++ b/thrust/system/omp/detail/memory.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,44 +14,20 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/system/omp/memory.h>
 #include <thrust/system/cpp/memory.h>
+
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
 {
-
-
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
 namespace detail
 {
 
@@ -106,5 +82,5 @@ inline void free(pointer<void> ptr)
 
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/par.h b/thrust/system/omp/detail/par.h
index abc6c2f23..b81a5d489 100644
--- a/thrust/system/omp/detail/par.h
+++ b/thrust/system/omp/detail/par.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,11 +17,10 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/omp/detail/execution_policy.h>
-#include <thrust/detail/execute_with_allocator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -30,16 +29,12 @@ namespace detail
 {
 
 
-struct par_t : thrust::system::omp::detail::execution_policy<par_t>
+struct par_t : thrust::system::omp::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::omp::detail::execution_policy>
 {
-  par_t() : thrust::system::omp::detail::execution_policy<par_t>() {}
-
-  template<typename Allocator>
-    thrust::detail::execute_with_allocator<Allocator, thrust::system::omp::detail::execution_policy>
-      operator()(Allocator &alloc) const
-  {
-    return thrust::detail::execute_with_allocator<Allocator, thrust::system::omp::detail::execution_policy>(alloc);
-  }
+  __host__ __device__
+  constexpr par_t() : thrust::system::omp::detail::execution_policy<par_t>() {}
 };
 
 
@@ -62,5 +57,5 @@ using thrust::system::omp::par;
 
 
 } // end omp
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/partition.h b/thrust/system/omp/detail/partition.h
index 64a76e278..7a6f4a934 100644
--- a/thrust/system/omp/detail/partition.h
+++ b/thrust/system/omp/detail/partition.h
@@ -25,8 +25,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -85,7 +84,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/partition.inl>
 
diff --git a/thrust/system/omp/detail/partition.inl b/thrust/system/omp/detail/partition.inl
index b81c17cbf..ba0a09eaf 100644
--- a/thrust/system/omp/detail/partition.inl
+++ b/thrust/system/omp/detail/partition.inl
@@ -25,8 +25,7 @@
 #include <thrust/system/omp/detail/partition.h>
 #include <thrust/system/detail/generic/partition.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -104,5 +103,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/per_device_resource.h b/thrust/system/omp/detail/per_device_resource.h
new file mode 100644
index 000000000..1b8d61f92
--- /dev/null
+++ b/thrust/system/omp/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/system/omp/detail/pragma_omp.h b/thrust/system/omp/detail/pragma_omp.h
new file mode 100644
index 000000000..a8eeae234
--- /dev/null
+++ b/thrust/system/omp/detail/pragma_omp.h
@@ -0,0 +1,56 @@
+/******************************************************************************
+* Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above copyright
+*       notice, this list of conditions and the following disclaimer in the
+*       documentation and/or other materials provided with the distribution.
+*     * Neither the name of the NVIDIA CORPORATION nor the
+*       names of its contributors may be used to endorse or promote products
+*       derived from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*
+******************************************************************************/
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC
+// MSVC ICEs when using the standard C++11 `_Pragma` operator with OpenMP
+// directives.
+// WAR this by using the MSVC-extension `__pragma`. See this link for more info:
+// https://developercommunity.visualstudio.com/t/Using-C11s-_Pragma-with-OpenMP-dire/1590628
+#define THRUST_PRAGMA_OMP_IMPL(directive) __pragma(directive)
+#else // Not MSVC:
+#define THRUST_PRAGMA_OMP_IMPL(directive) _Pragma(#directive)
+#endif
+
+// For internal use only -- THRUST_PRAGMA_OMP is used to switch between
+// different flavors of openmp pragmas. Pragmas are not emitted when OpenMP is
+// not available.
+//
+// Usage:
+//   Replace: #pragma omp parallel for
+//   With   : THRUST_PRAGMA_OMP(parallel for)
+//
+#if defined(_NVHPC_STDPAR_OPENMP) && _NVHPC_STDPAR_OPENMP == 1
+#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp_stdpar directive)
+#elif defined(_OPENMP)
+#define THRUST_PRAGMA_OMP(directive) THRUST_PRAGMA_OMP_IMPL(omp directive)
+#else
+#define THRUST_PRAGMA_OMP(directive)
+#endif
diff --git a/thrust/system/omp/detail/reduce.h b/thrust/system/omp/detail/reduce.h
index c058e05db..5e5f2106e 100644
--- a/thrust/system/omp/detail/reduce.h
+++ b/thrust/system/omp/detail/reduce.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce.inl>
 
diff --git a/thrust/system/omp/detail/reduce.inl b/thrust/system/omp/detail/reduce.inl
index 4609922a9..6a5723780 100644
--- a/thrust/system/omp/detail/reduce.inl
+++ b/thrust/system/omp/detail/reduce.inl
@@ -14,14 +14,15 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/system/omp/detail/reduce.h>
 #include <thrust/system/omp/detail/default_decomposition.h>
 #include <thrust/system/omp/detail/reduce_intervals.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -31,7 +32,7 @@ namespace detail
 
 
 template<typename DerivedPolicy,
-         typename InputIterator, 
+         typename InputIterator,
          typename OutputType,
          typename BinaryFunction>
   OutputType reduce(execution_policy<DerivedPolicy> &exec,
@@ -51,10 +52,10 @@ template<typename DerivedPolicy,
   // allocate storage for the initializer and partial sums
   // XXX use select_system for Tag
   thrust::detail::temporary_array<OutputType,DerivedPolicy> partial_sums(exec, decomp1.size() + 1);
-  
+
   // set first element of temp array to init
   partial_sums[0] = init;
-  
+
   // accumulate partial sums (first level reduction)
   thrust::system::omp::detail::reduce_intervals(exec, first, partial_sums.begin() + 1, binary_op, decomp1);
 
@@ -68,5 +69,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/reduce_by_key.h b/thrust/system/omp/detail/reduce_by_key.h
index 37e89ecba..005616de5 100644
--- a/thrust/system/omp/detail/reduce_by_key.h
+++ b/thrust/system/omp/detail/reduce_by_key.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -55,7 +54,7 @@ template <typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce_by_key.inl>
 
diff --git a/thrust/system/omp/detail/reduce_by_key.inl b/thrust/system/omp/detail/reduce_by_key.inl
index afd4c8e51..4088d0634 100644
--- a/thrust/system/omp/detail/reduce_by_key.inl
+++ b/thrust/system/omp/detail/reduce_by_key.inl
@@ -14,13 +14,14 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/reduce_by_key.h>
 #include <thrust/system/detail/generic/reduce_by_key.h>
 #include <thrust/distance.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -37,7 +38,7 @@ template <typename DerivedPolicy,
           typename BinaryFunction>
   thrust::pair<OutputIterator1,OutputIterator2>
     reduce_by_key(execution_policy<DerivedPolicy> &exec,
-                  InputIterator1 keys_first, 
+                  InputIterator1 keys_first,
                   InputIterator1 keys_last,
                   InputIterator2 values_first,
                   OutputIterator1 keys_output,
@@ -53,5 +54,5 @@ template <typename DerivedPolicy,
 } // end detail
 } // end omp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/reduce_intervals.h b/thrust/system/omp/detail/reduce_intervals.h
index 44551e645..1c69fc621 100644
--- a/thrust/system/omp/detail/reduce_intervals.h
+++ b/thrust/system/omp/detail/reduce_intervals.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -47,7 +46,7 @@ void reduce_intervals(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/reduce_intervals.inl>
 
diff --git a/thrust/system/omp/detail/reduce_intervals.inl b/thrust/system/omp/detail/reduce_intervals.inl
index ecce10c50..2668a7b60 100644
--- a/thrust/system/omp/detail/reduce_intervals.inl
+++ b/thrust/system/omp/detail/reduce_intervals.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/reduce_intervals.h>
@@ -21,8 +22,7 @@
 #include <thrust/detail/function.h>
 #include <thrust/detail/cstdint.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -46,8 +46,12 @@ void reduce_intervals(execution_policy<DerivedPolicy> &,
   // X Note to the user: If you've found this line due to a compiler error, X
   // X you need to enable OpenMP support in your compiler.                  X
   // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<InputIterator,
-                        (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)>::value) );
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      InputIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
 
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   typedef typename thrust::iterator_value<OutputIterator>::type OutputType;
@@ -59,9 +63,7 @@ void reduce_intervals(execution_policy<DerivedPolicy> &,
 
   index_type n = static_cast<index_type>(decomp.size());
 
-#if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
-# pragma omp parallel for
-#endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
+  THRUST_PRAGMA_OMP(parallel for)
   for(index_type i = 0; i < n; i++)
   {
     InputIterator begin = input + decomp[i].begin();
@@ -89,5 +91,5 @@ void reduce_intervals(execution_policy<DerivedPolicy> &,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/remove.h b/thrust/system/omp/detail/remove.h
index ca4eab845..9b2d46e75 100644
--- a/thrust/system/omp/detail/remove.h
+++ b/thrust/system/omp/detail/remove.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -75,7 +74,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/remove.inl>
 
diff --git a/thrust/system/omp/detail/remove.inl b/thrust/system/omp/detail/remove.inl
index aa8289476..5330f1407 100644
--- a/thrust/system/omp/detail/remove.inl
+++ b/thrust/system/omp/detail/remove.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/remove.h>
 #include <thrust/system/detail/generic/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -90,5 +89,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/sort.h b/thrust/system/omp/detail/sort.h
index 339ce5b6e..cf0b8c6d6 100644
--- a/thrust/system/omp/detail/sort.h
+++ b/thrust/system/omp/detail/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -49,7 +48,7 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/sort.inl>
 
diff --git a/thrust/system/omp/detail/sort.inl b/thrust/system/omp/detail/sort.inl
index 7c7c33e78..a0867ca4d 100644
--- a/thrust/system/omp/detail/sort.inl
+++ b/thrust/system/omp/detail/sort.inl
@@ -14,6 +14,7 @@
  *  limitations under the License.
  */
 
+#pragma once
 
 #include <thrust/detail/config.h>
 
@@ -23,14 +24,14 @@
 #endif // omp support
 
 #include <thrust/iterator/iterator_traits.h>
+#include <thrust/system/omp/detail/default_decomposition.h>
 #include <thrust/system/detail/generic/select_system.h>
 #include <thrust/sort.h>
 #include <thrust/merge.h>
 #include <thrust/detail/seq.h>
 #include <thrust/detail/temporary_array.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -106,16 +107,21 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
   // X Note to the user: If you've found this line due to a compiler error, X
   // X you need to enable OpenMP support in your compiler.                  X
   // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator,
-                        (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)>::value) );
-
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+  // Avoid issues on compilers that don't provide `omp_get_num_threads()`.
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   typedef typename thrust::iterator_difference<RandomAccessIterator>::type IndexType;
-  
+
   if(first == last)
     return;
 
-  #pragma omp parallel
+  THRUST_PRAGMA_OMP(parallel)
   {
     thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(last - first, 1, omp_get_num_threads());
 
@@ -131,7 +137,10 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
                           comp);
     }
 
-    #pragma omp barrier
+    THRUST_PRAGMA_OMP(barrier)
+
+    // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
+    ;
 
     IndexType nseg = decomp.size();
     IndexType h = 2;
@@ -159,7 +168,7 @@ void stable_sort(execution_policy<DerivedPolicy> &exec,
       nseg = (nseg + 1) / 2;
       h *= 2;
 
-      #pragma omp barrier
+      THRUST_PRAGMA_OMP(barrier)
     }
   }
 #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
@@ -181,16 +190,21 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
   // X Note to the user: If you've found this line due to a compiler error, X
   // X you need to enable OpenMP support in your compiler.                  X
   // ========================================================================
-  THRUST_STATIC_ASSERT( (thrust::detail::depend_on_instantiation<RandomAccessIterator1,
-                        (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)>::value) );
-
+  THRUST_STATIC_ASSERT_MSG(
+    (thrust::detail::depend_on_instantiation<
+      RandomAccessIterator1, (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
+    >::value)
+  , "OpenMP compiler support is not enabled"
+  );
+
+  // Avoid issues on compilers that don't provide `omp_get_num_threads()`.
 #if (THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE == THRUST_TRUE)
   typedef typename thrust::iterator_difference<RandomAccessIterator1>::type IndexType;
-  
+
   if(keys_first == keys_last)
     return;
 
-  #pragma omp parallel
+  THRUST_PRAGMA_OMP(parallel)
   {
     thrust::system::detail::internal::uniform_decomposition<IndexType> decomp(keys_last - keys_first, 1, omp_get_num_threads());
 
@@ -207,7 +221,10 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
                                  comp);
     }
 
-    #pragma omp barrier
+    THRUST_PRAGMA_OMP(barrier)
+
+    // XXX For some reason, MSVC 2015 yields an error unless we include this meaningless semicolon here
+    ;
 
     IndexType nseg = decomp.size();
     IndexType h = 2;
@@ -236,7 +253,7 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
       nseg = (nseg + 1) / 2;
       h *= 2;
 
-      #pragma omp barrier
+      THRUST_PRAGMA_OMP(barrier)
     }
   }
 #endif // THRUST_DEVICE_COMPILER_IS_OMP_CAPABLE
@@ -246,5 +263,5 @@ void stable_sort_by_key(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/unique.h b/thrust/system/omp/detail/unique.h
index 433e7689b..cf8025665 100644
--- a/thrust/system/omp/detail/unique.h
+++ b/thrust/system/omp/detail/unique.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -50,10 +49,20 @@ template<typename DerivedPolicy,
                              BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/unique.inl>
 
diff --git a/thrust/system/omp/detail/unique.inl b/thrust/system/omp/detail/unique.inl
index 70f026dbb..9a93fb135 100644
--- a/thrust/system/omp/detail/unique.inl
+++ b/thrust/system/omp/detail/unique.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -59,8 +58,22 @@ template<typename DerivedPolicy,
 } // end unique_copy()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  // omp prefers generic::unique_count to cpp::unique_count
+  return thrust::system::detail::generic::unique_count(exec,first,last,binary_pred);
+} // end unique_count()
+
+
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/unique_by_key.h b/thrust/system/omp/detail/unique_by_key.h
index ff3acb094..43859b64e 100644
--- a/thrust/system/omp/detail/unique_by_key.h
+++ b/thrust/system/omp/detail/unique_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/omp/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -61,7 +60,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/unique_by_key.inl>
 
diff --git a/thrust/system/omp/detail/unique_by_key.inl b/thrust/system/omp/detail/unique_by_key.inl
index 0a4367b7b..6610c8a00 100644
--- a/thrust/system/omp/detail/unique_by_key.inl
+++ b/thrust/system/omp/detail/unique_by_key.inl
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique_by_key.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -70,5 +69,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/omp/detail/vector.inl b/thrust/system/omp/detail/vector.inl
deleted file mode 100644
index 55190f30d..000000000
--- a/thrust/system/omp/detail/vector.inl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/omp/vector.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace omp
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-      
-} // end omp
-} // end system
-} // end thrust
-
diff --git a/thrust/system/omp/execution_policy.h b/thrust/system/omp/execution_policy.h
index e83289061..c027d6be6 100644
--- a/thrust/system/omp/execution_policy.h
+++ b/thrust/system/omp/execution_policy.h
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -129,7 +128,7 @@ struct tag : thrust::system::omp::execution_policy<tag> { unspecified };
  *    __host__ __device__
  *    void operator()(int x)
  *    {
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -150,7 +149,7 @@ static const unspecified par;
 
 } // end cpp
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/omp/memory.h b/thrust/system/omp/memory.h
index ba5646e85..31f407c4c 100644
--- a/thrust/system/omp/memory.h
+++ b/thrust/system/omp/memory.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -21,288 +21,15 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/omp/execution_policy.h>
+#include <thrust/system/omp/memory_resource.h>
 #include <thrust/memory.h>
 #include <thrust/detail/type_traits.h>
-#include <thrust/detail/allocator/malloc_allocator.h>
+#include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
-namespace system
-{
-namespace omp
-{
-
-template<typename> class pointer;
-
-} // end omp
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::omp::pointer<Element> >
-{
-  private:
-    typedef thrust::system::omp::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
-{
-namespace system
-{
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::omp
- *  \brief \p thrust::system::omp is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's OpenMP backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::omp</tt>
- *         namespace for easy access.
- *
- */
-namespace omp
-{
-
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::omp::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the omp system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in omp memory.
- *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
- *
- *  \p pointer can be created with the function \p omp::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
- *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
- *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
- *
- *  \tparam T specifies the type of the pointee.
- *
- *  \see omp::malloc
- *  \see omp::free
- *  \see raw_pointer_cast
- */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::omp::tag,
-               thrust::system::omp::reference<T>,
-               thrust::system::omp::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::omp::tag,
-      //thrust::system::omp::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::omp::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that omp::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p omp system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::omp::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p omp system.
- *  \p reference is the type of the result of dereferencing a \p omp::pointer.
- *
- *  \tparam T Specifies the type of the referenced object.
- */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::omp::pointer<T>,
-               thrust::system::omp::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::omp::pointer<T>,
-      thrust::system::omp::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference ot interest.
- */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
 
 /*! Allocates an area of memory available to Thrust's <tt>omp</tt> system.
  *  \param n Number of bytes to allocate.
@@ -337,78 +64,38 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T> using allocator = thrust::detail::malloc_allocator<T,tag,pointer<T> >;
-
-/*! \p omp::allocator is the default allocator used by the \p omp system's containers such as
- *  <tt>omp::vector</tt> if no user-specified allocator is provided. \p omp::allocator allocates
- *  (deallocates) storage with \p omp::malloc (\p omp::free).
+/*! \p omp::allocator is the default allocator used by the \p omp system's
+ *  containers such as <tt>omp::vector</tt> if no user-specified allocator is
+ *  provided. \p omp::allocator allocates (deallocates) storage with \p
+ *  omp::malloc (\p omp::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::detail::malloc_allocator<
-        T,
-        tag,
-        pointer<T>
-      >
-{
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator &) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> &) {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::omp::memory_resource
+>;
 
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end omp
-
-/*! \}
+/*! \p omp::universal_allocator allocates memory that can be used by the \p omp
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::omp::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::omp
 
 /*! \namespace thrust::omp
  *  \brief \p thrust::omp is a top-level alias for thrust::system::omp.
  */
 namespace omp
 {
-
-using thrust::system::omp::pointer;
-using thrust::system::omp::reference;
 using thrust::system::omp::malloc;
 using thrust::system::omp::free;
 using thrust::system::omp::allocator;
+using thrust::system::omp::universal_allocator;
+} // namespace omp
 
-} // end omp
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/omp/detail/memory.inl>
 
diff --git a/thrust/system/omp/memory_resource.h b/thrust/system/omp/memory_resource.h
new file mode 100644
index 000000000..d8eed0c0f
--- /dev/null
+++ b/thrust/system/omp/memory_resource.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file omp/memory_resource.h
+ *  \brief Memory resources for the OpenMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/omp/pointer.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::omp::pointer<void>
+    > native_resource;
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::omp::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! The memory resource for the OpenMP system. Uses \p mr::new_delete_resource
+ *  and tags it with \p omp::pointer.
+ */
+typedef detail::native_resource memory_resource;
+/*! The unified memory resource for the OpenMP system. Uses
+ *  \p mr::new_delete_resource and tags it with \p omp::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p omp::universal_memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \}
+ */
+
+}} // namespace system::omp
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/omp/pointer.h b/thrust/system/omp/pointer.h
new file mode 100644
index 000000000..2be42e4fc
--- /dev/null
+++ b/thrust/system/omp/pointer.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/omp/memory.h
+ *  \brief Managing memory associated with Thrust's OpenMP system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <type_traits>
+#include <thrust/system/omp/detail/execution_policy.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
+{
+
+/*! \p omp::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p omp system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p omp memory.
+ *
+ *  \p omp::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p omp::pointer can be created with the function \p omp::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p omp::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
+ *
+ *  \note \p omp::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p omp::pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see omp::malloc
+ *  \see omp::free
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::omp::tag,
+  thrust::tagged_reference<T, thrust::system::omp::tag>
+>;
+
+/*! \p omp::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p omp system and host systems.
+ *
+ *  \p omp::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p omp::universal_pointer can be created with \p omp::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p omp::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p omp::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p omp::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see omp::universal_allocator
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::omp::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p omp system. \p reference is the type of the result of
+ *  dereferencing a \p omp::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template <typename T>
+using reference = thrust::tagged_reference<T, thrust::system::omp::tag>;
+
+}} // namespace system::omp
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::omp
+ *  \brief \p thrust::omp is a top-level alias for \p thrust::system::omp. */
+namespace omp
+{
+using thrust::system::omp::pointer;
+using thrust::system::omp::universal_pointer;
+using thrust::system::omp::reference;
+} // namespace omp
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/omp/vector.h b/thrust/system/omp/vector.h
index f0ef310d5..179b5207d 100644
--- a/thrust/system/omp/vector.h
+++ b/thrust/system/omp/vector.h
@@ -26,124 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace omp
 {
 
-// forward declaration of host_vector
-// XXX why is this here? it doesn't seem necessary for anything below
-template<typename T, typename Allocator> class host_vector;
-
-namespace system
-{
-namespace omp
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
-
 /*! \p omp::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p omp::vector may vary dynamically; memory management is
  *  automatic. The elements contained in an \p omp::vector reside in memory
- *  available to the \p omp system.
+ *  accessible by the \p omp system.
  *
  *  \tparam T The element type of the \p omp::vector.
- *  \tparam Allocator The allocator type of the \p omp::vector. Defaults to \p omp::allocator.
+ *  \tparam Allocator The allocator type of the \p omp::vector.
+ *          Defaults to \p omp::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p omp::vector
+ *                   shared by \p omp::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-    
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p omp::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p omp::vector with \p n default-constructed elements.
-     *  \param n The size of the \p omp::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p omp::vector with \p n copies of \p value.
-     *  \param n The size of the \p omp::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p omp::vector.
-     *  \param x The other \p omp::vector to copy.
-     */
-    vector(const vector &x);
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates an \p omp::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::omp::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p omp::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p omp::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p omp::universal_vector reside in memory accessible by the \p omp system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p omp::universal_vector.
+ *  \tparam Allocator The allocator type of the \p omp::universal_vector.
+ *          Defaults to \p omp::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p omp::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::omp::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end omp
-} // end system
+}} // namespace system::omp
 
-// alias system::omp names at top-level
 namespace omp
 {
-
 using thrust::system::omp::vector;
+using thrust::system::omp::universal_vector;
+}
 
-} // end omp
-
-} // end thrust
-
-#include <thrust/system/omp/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/system_error.h b/thrust/system/system_error.h
index 84e453dc6..fb31a2da8 100644
--- a/thrust/system/system_error.h
+++ b/thrust/system/system_error.h
@@ -28,8 +28,7 @@
 
 #include <thrust/system/error_code.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 namespace system
 {
@@ -141,19 +140,19 @@ class system_error
 
     /*! Destructor does not throw.
      */
-    inline virtual ~system_error(void) throw () {};
+    inline virtual ~system_error(void) noexcept {};
     
     /*! Returns an object encoding the error.
      *  \return <tt>ec</tt> or <tt>error_code(ev, ecat)</tt>, from the
      *          constructor, as appropriate.
      */
-    inline const error_code &code(void) const throw();
+    inline const error_code &code(void) const noexcept;
 
     /*! Returns a human-readable string indicating the nature of the error.
      *  \return a string incorporating <tt>code().message()</tt> and the
      *          arguments supplied in the constructor.
      */
-    inline const char *what(void) const throw();
+    inline const char *what(void) const noexcept;
 
     /*! \cond
      */
@@ -173,7 +172,7 @@ class system_error
 // import names into thrust::
 using system::system_error;
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/detail/system_error.inl>
 
diff --git a/thrust/system/tbb/detail/adjacent_difference.h b/thrust/system/tbb/detail/adjacent_difference.h
index d22b4aac3..ab519d11e 100644
--- a/thrust/system/tbb/detail/adjacent_difference.h
+++ b/thrust/system/tbb/detail/adjacent_difference.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/system/detail/generic/adjacent_difference.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -46,5 +45,5 @@ template<typename DerivedPolicy,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/copy.h b/thrust/system/tbb/detail/copy.h
index 67c91ce10..30e95a98c 100644
--- a/thrust/system/tbb/detail/copy.h
+++ b/thrust/system/tbb/detail/copy.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -51,7 +50,7 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/copy.inl>
 
diff --git a/thrust/system/tbb/detail/copy.inl b/thrust/system/tbb/detail/copy.inl
index 7adf620d2..1016f40d4 100644
--- a/thrust/system/tbb/detail/copy.inl
+++ b/thrust/system/tbb/detail/copy.inl
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -23,8 +23,7 @@
 #include <thrust/detail/type_traits/minimum_type.h>
 #include <thrust/detail/copy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -133,5 +132,5 @@ OutputIterator copy_n(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/copy_if.h b/thrust/system/tbb/detail/copy_if.h
index 0420893ba..db860f377 100644
--- a/thrust/system/tbb/detail/copy_if.h
+++ b/thrust/system/tbb/detail/copy_if.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -44,7 +43,7 @@ template<typename InputIterator1,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/copy_if.inl>
 
diff --git a/thrust/system/tbb/detail/copy_if.inl b/thrust/system/tbb/detail/copy_if.inl
index 9c074a9fc..aa2379b8d 100644
--- a/thrust/system/tbb/detail/copy_if.inl
+++ b/thrust/system/tbb/detail/copy_if.inl
@@ -24,8 +24,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -127,5 +126,5 @@ template<typename InputIterator1,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/execution_policy.h b/thrust/system/tbb/detail/execution_policy.h
index 69ad0a45a..ac4a788e7 100644
--- a/thrust/system/tbb/detail/execution_policy.h
+++ b/thrust/system/tbb/detail/execution_policy.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -21,8 +21,7 @@
 #include <thrust/iterator/detail/any_system_tag.h>
 #include <thrust/detail/type_traits.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 // put the canonical tag in the same ns as the backend's entry points
@@ -58,11 +57,8 @@ template<typename Derived>
   struct execution_policy
     : thrust::system::cpp::detail::execution_policy<Derived>
 {
-  // allow conversion to tag
-  inline operator tag () const
-  {
-    return tag();
-  }
+  typedef tag tag_type; 
+  operator tag() const { return tag(); }
 };
 
 } // end detail
@@ -82,5 +78,5 @@ using thrust::system::tbb::execution_policy;
 using thrust::system::tbb::tag;
 
 } // end tbb
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/extrema.h b/thrust/system/tbb/detail/extrema.h
index 760c4ee5a..c6c747f42 100644
--- a/thrust/system/tbb/detail/extrema.h
+++ b/thrust/system/tbb/detail/extrema.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/system/detail/generic/extrema.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -62,6 +61,6 @@ thrust::pair<ForwardIterator,ForwardIterator> minmax_element(execution_policy<De
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 
diff --git a/thrust/system/tbb/detail/find.h b/thrust/system/tbb/detail/find.h
index e07d322a8..e5dea8e77 100644
--- a/thrust/system/tbb/detail/find.h
+++ b/thrust/system/tbb/detail/find.h
@@ -20,8 +20,7 @@
 #include <thrust/system/detail/generic/find.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -42,5 +41,5 @@ InputIterator find_if(execution_policy<DerivedPolicy> &exec,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/for_each.h b/thrust/system/tbb/detail/for_each.h
index a57a7d79d..26c4b539b 100644
--- a/thrust/system/tbb/detail/for_each.h
+++ b/thrust/system/tbb/detail/for_each.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/for_each.inl>
 
diff --git a/thrust/system/tbb/detail/for_each.inl b/thrust/system/tbb/detail/for_each.inl
index 4e665e735..21dfce9ae 100644
--- a/thrust/system/tbb/detail/for_each.inl
+++ b/thrust/system/tbb/detail/for_each.inl
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -14,17 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/static_assert.h>
 #include <thrust/distance.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/distance.h>
 #include <thrust/system/detail/sequential/execution_policy.h>
+
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -78,7 +80,7 @@ RandomAccessIterator for_each_n(execution_policy<DerivedPolicy> &,
 
   // return the end of the range
   return first + n;
-} // end for_each_n 
+} // end for_each_n
 
 
 template<typename DerivedPolicy,
@@ -96,5 +98,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/memory.inl b/thrust/system/tbb/detail/memory.inl
index e221081c6..32e28300a 100644
--- a/thrust/system/tbb/detail/memory.inl
+++ b/thrust/system/tbb/detail/memory.inl
@@ -1,8 +1,8 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -14,44 +14,21 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/system/cpp/detail/execution_policy.h>
 #include <thrust/system/tbb/memory.h>
 #include <thrust/system/cpp/memory.h>
 #include <limits>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
 {
 
 
-template<typename T>
-  template<typename OtherT>
-    reference<T> &
-      reference<T>
-        ::operator=(const reference<OtherT> &other)
-{
-  return super_t::operator=(other);
-} // end reference::operator=()
-
-template<typename T>
-  reference<T> &
-    reference<T>
-      ::operator=(const value_type &x)
-{
-  return super_t::operator=(x);
-} // end reference::operator=()
-
-template<typename T>
-__host__ __device__
-void swap(reference<T> a, reference<T> b)
-{
-  a.swap(b);
-} // end swap()
-
 namespace detail
 {
 
@@ -106,5 +83,5 @@ inline void free(pointer<void> ptr)
 
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/merge.h b/thrust/system/tbb/detail/merge.h
index 44608959c..014e2eb8b 100644
--- a/thrust/system/tbb/detail/merge.h
+++ b/thrust/system/tbb/detail/merge.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -64,7 +63,7 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/merge.inl>
 
diff --git a/thrust/system/tbb/detail/merge.inl b/thrust/system/tbb/detail/merge.inl
index bcc728546..89a01aebf 100644
--- a/thrust/system/tbb/detail/merge.inl
+++ b/thrust/system/tbb/detail/merge.inl
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -14,6 +14,10 @@
  *  limitations under the License.
  */
 
+#pragma once
+
+#include <thrust/detail/config.h>
+
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
@@ -22,8 +26,7 @@
 #include <thrust/detail/seq.h>
 #include <tbb/parallel_for.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -54,7 +57,7 @@ struct range
       first2(first2), last2(last2),
       result(result), comp(comp), grain_size(grain_size)
   {}
-  
+
   range(range& r, ::tbb::split)
     : first1(r.first1), last1(r.last1),
       first2(r.first2), last2(r.last2),
@@ -77,7 +80,7 @@ struct range
       mid2 += n2 / 2;
       mid1 = thrust::upper_bound(thrust::seq, first1, last1, raw_reference_cast(*mid2), comp);
     }
-    
+
     // set first range to [first1, mid1), [first2, mid2), result
     r.last1 = mid1;
     r.last2 = mid2;
@@ -150,7 +153,7 @@ struct range
       keys_result(keys_result), values_result(values_result),
       comp(comp), grain_size(grain_size)
   {}
-  
+
   range(range& r, ::tbb::split)
     : keys_first1(r.keys_first1), keys_last1(r.keys_last1),
       keys_first2(r.keys_first2), keys_last2(r.keys_last2),
@@ -176,12 +179,12 @@ struct range
       mid2 += n2 / 2;
       mid1 = thrust::upper_bound(thrust::seq, keys_first1, keys_last1, raw_reference_cast(*mid2), comp);
     }
-    
+
     // set first range to [keys_first1, mid1), [keys_first2, mid2), keys_result, values_result
     r.keys_last1 = mid1;
     r.keys_last2 = mid2;
 
-    // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2) 
+    // set second range to [mid1, keys_last1), [mid2, keys_last2), keys_result + (mid1 - keys_first1) + (mid2 - keys_first2), values_result + (mid1 - keys_first1) + (mid2 - keys_first2)
     keys_first1 = mid1;
     keys_first2 = mid2;
     values_first1 += thrust::distance(r.keys_first1, mid1);
@@ -225,7 +228,7 @@ template<typename DerivedPolicy,
          typename InputIterator2,
          typename OutputIterator,
          typename StrictWeakOrdering>
-OutputIterator merge(execution_policy<DerivedPolicy> &exec,
+OutputIterator merge(execution_policy<DerivedPolicy> &,
                      InputIterator1 first1,
                      InputIterator1 last1,
                      InputIterator2 first2,
@@ -254,7 +257,7 @@ template <typename DerivedPolicy,
           typename OutputIterator2,
           typename StrictWeakOrdering>
 thrust::pair<OutputIterator1,OutputIterator2>
-  merge_by_key(execution_policy<DerivedPolicy> &exec,
+  merge_by_key(execution_policy<DerivedPolicy> &,
                InputIterator1 keys_first1,
                InputIterator1 keys_last1,
                InputIterator2 keys_first2,
@@ -282,5 +285,5 @@ thrust::pair<OutputIterator1,OutputIterator2>
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/par.h b/thrust/system/tbb/detail/par.h
index a571bfef2..308d41e13 100644
--- a/thrust/system/tbb/detail/par.h
+++ b/thrust/system/tbb/detail/par.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -17,11 +17,10 @@
 #pragma once
 
 #include <thrust/detail/config.h>
+#include <thrust/detail/allocator_aware_execution_policy.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
-#include <thrust/detail/execute_with_allocator.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -30,16 +29,12 @@ namespace detail
 {
 
 
-struct par_t : thrust::system::tbb::detail::execution_policy<par_t>
+struct par_t : thrust::system::tbb::detail::execution_policy<par_t>,
+  thrust::detail::allocator_aware_execution_policy<
+    thrust::system::tbb::detail::execution_policy>
 {
-  par_t() : thrust::system::tbb::detail::execution_policy<par_t>() {}
-
-  template<typename Allocator>
-    thrust::detail::execute_with_allocator<Allocator, thrust::system::tbb::detail::execution_policy>
-      operator()(Allocator &alloc) const
-  {
-    return thrust::detail::execute_with_allocator<Allocator, thrust::system::tbb::detail::execution_policy>(alloc);
-  }
+  __host__ __device__
+  constexpr par_t() : thrust::system::tbb::detail::execution_policy<par_t>() {}
 };
 
 
@@ -62,5 +57,5 @@ using thrust::system::tbb::par;
 
 
 } // end tbb
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/partition.h b/thrust/system/tbb/detail/partition.h
index 80323535c..f9c56b92b 100644
--- a/thrust/system/tbb/detail/partition.h
+++ b/thrust/system/tbb/detail/partition.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -81,7 +80,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/partition.inl>
 
diff --git a/thrust/system/tbb/detail/partition.inl b/thrust/system/tbb/detail/partition.inl
index 5085ed906..74ad809da 100644
--- a/thrust/system/tbb/detail/partition.inl
+++ b/thrust/system/tbb/detail/partition.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/partition.h>
 #include <thrust/system/detail/generic/partition.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -98,5 +97,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/per_device_resource.h b/thrust/system/tbb/detail/per_device_resource.h
new file mode 100644
index 000000000..1b8d61f92
--- /dev/null
+++ b/thrust/system/tbb/detail/per_device_resource.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright 2018 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// this system has no special per device resource functions
+
diff --git a/thrust/system/tbb/detail/reduce.h b/thrust/system/tbb/detail/reduce.h
index 7381da382..81e8d1f6f 100644
--- a/thrust/system/tbb/detail/reduce.h
+++ b/thrust/system/tbb/detail/reduce.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -48,7 +47,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/reduce.inl>
 
diff --git a/thrust/system/tbb/detail/reduce.inl b/thrust/system/tbb/detail/reduce.inl
index 22a13f63d..47fe6616d 100644
--- a/thrust/system/tbb/detail/reduce.inl
+++ b/thrust/system/tbb/detail/reduce.inl
@@ -26,8 +26,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_reduce.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -100,7 +99,7 @@ template<typename DerivedPolicy,
          typename InputIterator, 
          typename OutputType,
          typename BinaryFunction>
-  OutputType reduce(execution_policy<DerivedPolicy> &exec,
+  OutputType reduce(execution_policy<DerivedPolicy> &,
                     InputIterator begin,
                     InputIterator end,
                     OutputType init,
@@ -127,5 +126,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/reduce_by_key.h b/thrust/system/tbb/detail/reduce_by_key.h
index d8e3b38c5..04d46e7c0 100644
--- a/thrust/system/tbb/detail/reduce_by_key.h
+++ b/thrust/system/tbb/detail/reduce_by_key.h
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -51,7 +50,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/reduce_by_key.inl>
 
diff --git a/thrust/system/tbb/detail/reduce_by_key.inl b/thrust/system/tbb/detail/reduce_by_key.inl
index 92c0a2f8d..693abb2e7 100644
--- a/thrust/system/tbb/detail/reduce_by_key.inl
+++ b/thrust/system/tbb/detail/reduce_by_key.inl
@@ -27,12 +27,12 @@
 #include <thrust/detail/range/tail_flags.h>
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
-#include <tbb/tbb_thread.h>
+
 #include <cassert>
+#include <thread>
 
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -81,7 +81,7 @@ template<typename InputIterator1,
   thrust::pair<
     InputIterator1,
     thrust::pair<
-      typename InputIterator1::value_type,
+      typename thrust::iterator_value<InputIterator1>::type,
       typename partial_sum_type<InputIterator2,BinaryFunction>::type
     >
   >
@@ -98,7 +98,7 @@ template<typename InputIterator1,
   thrust::reverse_iterator<InputIterator1> keys_last_r(keys_first);
   thrust::reverse_iterator<InputIterator2> values_first_r(values_first + n);
 
-  typename InputIterator1::value_type result_key = *keys_first_r;
+  typename thrust::iterator_value<InputIterator1>::type result_key = *keys_first_r;
   typename partial_sum_type<InputIterator2,BinaryFunction>::type result_value = *values_first_r;
 
   // consume the entirety of the first key's sequence
@@ -122,7 +122,7 @@ template<typename InputIterator1,
   thrust::tuple<
     OutputIterator1,
     OutputIterator2,
-    typename InputIterator1::value_type,
+    typename thrust::iterator_value<InputIterator1>::type,
     typename partial_sum_type<InputIterator2,BinaryFunction>::type
   >
     reduce_by_key_with_carry(InputIterator1 keys_first, 
@@ -136,7 +136,7 @@ template<typename InputIterator1,
   // first, consume the last sequence to produce the carry
   // XXX is there an elegant way to pose this such that we don't need to default construct carry?
   thrust::pair<
-    typename InputIterator1::value_type,
+    typename thrust::iterator_value<InputIterator1>::type,
     typename partial_sum_type<InputIterator2,BinaryFunction>::type
   > carry;
 
@@ -198,7 +198,7 @@ template<typename Iterator1, typename Iterator2, typename Iterator3, typename It
     const size_type interval_idx = r.begin();
 
     const size_type offset_to_first = interval_size * interval_idx;
-    const size_type offset_to_last = thrust::min(n, offset_to_first + interval_size);
+    const size_type offset_to_last = (thrust::min)(n, offset_to_first + interval_size);
 
     Iterator1 my_keys_first     = keys_first    + offset_to_first;
     Iterator1 my_keys_last      = keys_first    + offset_to_last;
@@ -281,7 +281,7 @@ template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typenam
   }
 
   // count the number of processors
-  const unsigned int p = thrust::max<unsigned int>(1u, ::tbb::tbb_thread::hardware_concurrency());
+  const unsigned int p = thrust::max<unsigned int>(1u, std::thread::hardware_concurrency());
 
   // generate O(P) intervals of sequential work
   // XXX oversubscribing is a tuning opportunity
@@ -337,5 +337,5 @@ template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typenam
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/reduce_intervals.h b/thrust/system/tbb/detail/reduce_intervals.h
index 88fefe43d..7164c3f97 100644
--- a/thrust/system/tbb/detail/reduce_intervals.h
+++ b/thrust/system/tbb/detail/reduce_intervals.h
@@ -27,8 +27,7 @@
 #include <thrust/reduce.h>
 #include <cassert>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -65,7 +64,7 @@ template<typename RandomAccessIterator1, typename RandomAccessIterator2, typenam
     Size interval_idx = r.begin();
 
     Size offset_to_first = interval_size * interval_idx;
-    Size offset_to_last = thrust::min(n, offset_to_first + interval_size);
+    Size offset_to_last = (thrust::min)(n, offset_to_first + interval_size);
 
     RandomAccessIterator1 my_first = first + offset_to_first;
     RandomAccessIterator1 my_last  = first + offset_to_last;
@@ -121,5 +120,5 @@ template<typename DerivedPolicy, typename RandomAccessIterator1, typename Size,
 } // end detail
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/remove.h b/thrust/system/tbb/detail/remove.h
index 49f70588d..34cd91799 100644
--- a/thrust/system/tbb/detail/remove.h
+++ b/thrust/system/tbb/detail/remove.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/omp/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace omp
@@ -75,7 +74,7 @@ template<typename ExecutionPolicy,
 } // end namespace detail
 } // end namespace omp
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/remove.inl>
 
diff --git a/thrust/system/tbb/detail/remove.inl b/thrust/system/tbb/detail/remove.inl
index 0a937799d..76d77e64b 100644
--- a/thrust/system/tbb/detail/remove.inl
+++ b/thrust/system/tbb/detail/remove.inl
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/remove.h>
 #include <thrust/system/detail/generic/remove.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -90,5 +89,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/scan.h b/thrust/system/tbb/detail/scan.h
index 32a05a5a6..b31b46317 100644
--- a/thrust/system/tbb/detail/scan.h
+++ b/thrust/system/tbb/detail/scan.h
@@ -24,8 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -58,7 +57,7 @@ template<typename InputIterator,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/scan.inl>
 
diff --git a/thrust/system/tbb/detail/scan.inl b/thrust/system/tbb/detail/scan.inl
index d58022934..d6e894983 100644
--- a/thrust/system/tbb/detail/scan.inl
+++ b/thrust/system/tbb/detail/scan.inl
@@ -28,8 +28,7 @@
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_scan.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -104,7 +103,12 @@ struct inclusive_body
 
   void reverse_join(inclusive_body& b)
   {
-    sum = binary_op(b.sum, sum);
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
   } 
 
   void assign(inclusive_body& b)
@@ -172,8 +176,13 @@ struct exclusive_body
 
   void reverse_join(exclusive_body& b)
   {
-    sum = binary_op(b.sum, sum);
-  } 
+    // Only accumulate this functor's partial sum if this functor has been
+    // called at least once -- otherwise we'll over-count the initial value.
+    if (!first_call)
+    {
+      sum = binary_op(b.sum, sum);
+    }
+  }
 
   void assign(exclusive_body& b)
   {
@@ -183,8 +192,6 @@ struct exclusive_body
 
 } // end scan_detail
 
-
-
 template<typename InputIterator,
          typename OutputIterator,
          typename BinaryFunction>
@@ -194,32 +201,12 @@ template<typename InputIterator,
                                 OutputIterator result,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<BinaryFunction>::type
-  
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
-  
-  typedef typename thrust::iterator_difference<InputIterator>::type Size; 
-  
+  // Use the input iterator's value type per https://wg21.link/P0571
+  using ValueType = typename thrust::iterator_value<InputIterator>::type;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
   Size n = thrust::distance(first, last);
 
   if (n != 0)
@@ -228,50 +215,29 @@ template<typename InputIterator,
     Body scan_body(first, result, binary_op, *first);
     ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
   }
- 
+
   thrust::advance(result, n);
 
   return result;
 }
 
-
 template<typename InputIterator,
          typename OutputIterator,
-         typename T,
+         typename InitialValueType,
          typename BinaryFunction>
   OutputIterator exclusive_scan(tag,
                                 InputIterator first,
                                 InputIterator last,
                                 OutputIterator result,
-                                T init,
+                                InitialValueType init,
                                 BinaryFunction binary_op)
 {
-  // the pseudocode for deducing the type of the temporary used below:
-  // 
-  // if BinaryFunction is AdaptableBinaryFunction
-  //   TemporaryType = AdaptableBinaryFunction::result_type
-  // else if OutputIterator is a "pure" output iterator
-  //   TemporaryType = InputIterator::value_type
-  // else
-  //   TemporaryType = OutputIterator::value_type
-  //
-  // XXX upon c++0x, TemporaryType needs to be:
-  // result_of<BinaryFunction>::type
-
   using namespace thrust::detail;
 
-  typedef typename eval_if<
-    has_result_type<BinaryFunction>::value,
-    result_type<BinaryFunction>,
-    eval_if<
-      is_output_iterator<OutputIterator>::value,
-      thrust::iterator_value<InputIterator>,
-      thrust::iterator_value<OutputIterator>
-    >
-  >::type ValueType;
-
-  typedef typename thrust::iterator_difference<InputIterator>::type Size; 
-  
+  // Use the initial value type per https://wg21.link/P0571
+  using ValueType = InitialValueType;
+
+  using Size = typename thrust::iterator_difference<InputIterator>::type;
   Size n = thrust::distance(first, last);
 
   if (n != 0)
@@ -280,7 +246,7 @@ template<typename InputIterator,
     Body scan_body(first, result, binary_op, init);
     ::tbb::parallel_scan(::tbb::blocked_range<Size>(0,n), scan_body);
   }
- 
+
   thrust::advance(result, n);
 
   return result;
@@ -289,5 +255,4 @@ template<typename InputIterator,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system/tbb/detail/sort.h b/thrust/system/tbb/detail/sort.h
index 863189a1e..9c58bf6d4 100644
--- a/thrust/system/tbb/detail/sort.h
+++ b/thrust/system/tbb/detail/sort.h
@@ -19,8 +19,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/system/tbb/detail/execution_policy.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -49,7 +48,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/sort.inl>
 
diff --git a/thrust/system/tbb/detail/sort.inl b/thrust/system/tbb/detail/sort.inl
index ec3b34cf1..103710fba 100644
--- a/thrust/system/tbb/detail/sort.inl
+++ b/thrust/system/tbb/detail/sort.inl
@@ -14,17 +14,19 @@
  *  limitations under the License.
  */
 
+#pragma once
+
 #include <thrust/detail/config.h>
 #include <thrust/detail/temporary_array.h>
 #include <thrust/detail/copy.h>
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/distance.h>
 #include <thrust/merge.h>
+#include <thrust/sort.h>
 #include <thrust/detail/seq.h>
 #include <tbb/parallel_invoke.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -38,7 +40,7 @@ namespace sort_detail
 // TODO tune this based on data type and comp
 const static int threshold = 128 * 1024;
 
-  
+
 template<typename DerivedPolicy, typename Iterator1, typename Iterator2, typename StrictWeakOrdering>
 void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterator1 last1, Iterator2 first2, StrictWeakOrdering comp, bool inplace);
 
@@ -73,7 +75,7 @@ void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterato
   if (n < threshold)
   {
     thrust::stable_sort(thrust::seq, first1, last1, comp);
-    
+
     if(!inplace)
     {
       thrust::copy(thrust::seq, first1, last1, first2);
@@ -87,7 +89,7 @@ void merge_sort(execution_policy<DerivedPolicy> &exec, Iterator1 first1, Iterato
   Iterator2 last2 = first2 + n;
 
   typedef merge_sort_closure<DerivedPolicy,Iterator1,Iterator2,StrictWeakOrdering> Closure;
-  
+
   Closure left (exec, first1, mid1,  first2, comp, !inplace);
   Closure right(exec, mid1,   last1, mid2,   comp, !inplace);
 
@@ -108,7 +110,7 @@ namespace sort_by_key_detail
 // TODO tune this based on data type and comp
 const static int threshold = 128 * 1024;
 
-  
+
 template<typename DerivedPolicy,
          typename Iterator1,
          typename Iterator2,
@@ -177,7 +179,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   typedef typename thrust::iterator_difference<Iterator1>::type difference_type;
 
   difference_type n = thrust::distance(first1, last1);
-  
+
   Iterator1 mid1  = first1 + (n / 2);
   Iterator2 mid2  = first2 + (n / 2);
   Iterator3 mid3  = first3 + (n / 2);
@@ -188,7 +190,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   if (n < threshold)
   {
     thrust::stable_sort_by_key(thrust::seq, first1, last1, first2, comp);
-    
+
     if(!inplace)
     {
       thrust::copy(thrust::seq, first1, last1, first3);
@@ -199,7 +201,7 @@ void merge_sort_by_key(execution_policy<DerivedPolicy> &exec,
   }
 
   typedef merge_sort_by_key_closure<DerivedPolicy,Iterator1,Iterator2,Iterator3,Iterator4,StrictWeakOrdering> Closure;
-  
+
   Closure left (exec, first1, mid1,  first2, first3, first4, comp, !inplace);
   Closure right(exec, mid1,   last1, mid2,   mid3,   mid4,   comp, !inplace);
 
@@ -260,5 +262,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/unique.h b/thrust/system/tbb/detail/unique.h
index 3d594fabd..843e6406e 100644
--- a/thrust/system/tbb/detail/unique.h
+++ b/thrust/system/tbb/detail/unique.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -50,10 +49,20 @@ template<typename ExecutionPolicy,
                              BinaryPredicate binary_pred);
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/unique.inl>
 
diff --git a/thrust/system/tbb/detail/unique.inl b/thrust/system/tbb/detail/unique.inl
index fb070ae47..136af897c 100644
--- a/thrust/system/tbb/detail/unique.inl
+++ b/thrust/system/tbb/detail/unique.inl
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -59,8 +58,22 @@ template<typename DerivedPolicy,
 } // end unique_copy()
 
 
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(execution_policy<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred)
+{
+  // tbb prefers generic::unique_count to cpp::unique_count
+  return thrust::system::detail::generic::unique_count(exec,first,last,binary_pred);
+} // end unique_count()
+
+
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/unique_by_key.h b/thrust/system/tbb/detail/unique_by_key.h
index 0cc4d7605..513bb386e 100644
--- a/thrust/system/tbb/detail/unique_by_key.h
+++ b/thrust/system/tbb/detail/unique_by_key.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -20,8 +20,7 @@
 #include <thrust/system/tbb/detail/execution_policy.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -61,7 +60,7 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb 
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/unique_by_key.inl>
 
diff --git a/thrust/system/tbb/detail/unique_by_key.inl b/thrust/system/tbb/detail/unique_by_key.inl
index e2bbade29..dbd5922b0 100644
--- a/thrust/system/tbb/detail/unique_by_key.inl
+++ b/thrust/system/tbb/detail/unique_by_key.inl
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -21,8 +21,7 @@
 #include <thrust/system/detail/generic/unique_by_key.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -70,5 +69,5 @@ template<typename DerivedPolicy,
 } // end namespace detail
 } // end namespace tbb
 } // end namespace system
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
diff --git a/thrust/system/tbb/detail/vector.inl b/thrust/system/tbb/detail/vector.inl
deleted file mode 100644
index b323feda8..000000000
--- a/thrust/system/tbb/detail/vector.inl
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- *  Copyright 2008-2013 NVIDIA Corporation
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <thrust/detail/config.h>
-#include <thrust/system/tbb/vector.h>
-
-namespace thrust
-{
-namespace system
-{
-namespace tbb
-{
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector()
-      : super_t()
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n)
-      : super_t(n)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(size_type n, const value_type &value)
-      : super_t(n,value)
-{}
-
-template<typename T, typename Allocator>
-  vector<T,Allocator>
-    ::vector(const vector &x)
-      : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator>
-      ::vector(const std::vector<OtherT,OtherAllocator> &x)
-        : super_t(x)
-{}
-
-template<typename T, typename Allocator>
-  template<typename InputIterator>
-    vector<T,Allocator>
-      ::vector(InputIterator first, InputIterator last)
-        : super_t(first,last)
-{}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const std::vector<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-
-template<typename T, typename Allocator>
-  template<typename OtherT, typename OtherAllocator>
-    vector<T,Allocator> &
-      vector<T,Allocator>
-        ::operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x)
-{
-  super_t::operator=(x);
-  return *this;
-}
-      
-} // end tbb
-} // end system
-} // end thrust
-
diff --git a/thrust/system/tbb/execution_policy.h b/thrust/system/tbb/execution_policy.h
index 2b7db0b43..bfa6b7893 100644
--- a/thrust/system/tbb/execution_policy.h
+++ b/thrust/system/tbb/execution_policy.h
@@ -76,8 +76,7 @@
 // define these entities here for the purpose of Doxygenating them
 // they are actually defined elsewhere
 #if 0
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
 namespace tbb
@@ -129,7 +128,7 @@ struct tag : thrust::system::tbb::execution_policy<tag> { unspecified };
  *    __host__ __device__
  *    void operator()(int x)
  *    {
- *      printf("%d\n");
+ *      printf("%d\n", x);
  *    }
  *  };
  *  ...
@@ -150,7 +149,7 @@ static const unspecified par;
 
 } // end tbb
 } // end system
-} // end thrust
+THRUST_NAMESPACE_END
 #endif
 
 
diff --git a/thrust/system/tbb/memory.h b/thrust/system/tbb/memory.h
index e40313cd2..3bd442232 100644
--- a/thrust/system/tbb/memory.h
+++ b/thrust/system/tbb/memory.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in ctbbliance with the License.
@@ -21,289 +21,18 @@
 #pragma once
 
 #include <thrust/detail/config.h>
-#include <thrust/system/tbb/execution_policy.h>
+#include <thrust/system/tbb/memory_resource.h>
 #include <thrust/memory.h>
 #include <thrust/detail/type_traits.h>
-#include <thrust/detail/allocator/malloc_allocator.h>
+#include <thrust/mr/allocator.h>
 #include <ostream>
 
-namespace thrust
-{
-namespace system
-{
-namespace tbb
-{
-
-template<typename> class pointer;
-
-} // end tbb
-} // end system
-} // end thrust
-
-
-/*! \cond
- */
-
-// specialize thrust::iterator_traits to avoid problems with the name of
-// pointer's constructor shadowing its nested pointer type
-// do this before pointer is defined so the specialization is correctly
-// used inside the definition
-namespace thrust
-{
-
-template<typename Element>
-  struct iterator_traits<thrust::system::tbb::pointer<Element> >
-{
-  private:
-    typedef thrust::system::tbb::pointer<Element> ptr;
-
-  public:
-    typedef typename ptr::iterator_category       iterator_category;
-    typedef typename ptr::value_type              value_type;
-    typedef typename ptr::difference_type         difference_type;
-    typedef ptr                                   pointer;
-    typedef typename ptr::reference               reference;
-}; // end iterator_traits
-
-} // end thrust
-
-/*! \endcond
- */
-
-
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 namespace system
 {
-
-/*! \addtogroup system_backends Systems
- *  \ingroup system
- *  \{
- */
-
-/*! \namespace thrust::system::tbb
- *  \brief \p thrust::system::tbb is the namespace containing functionality for allocating, manipulating,
- *         and deallocating memory available to Thrust's TBB backend system.
- *         The identifiers are provided in a separate namespace underneath <tt>thrust::system</tt>
- *         for import convenience but are also aliased in the top-level <tt>thrust::tbb</tt>
- *         namespace for easy access.
- *
- */
 namespace tbb
 {
 
-// forward declaration of reference for pointer
-template<typename Element> class reference;
-
-/*! \cond
- */
-
-// XXX nvcc + msvc have trouble instantiating reference below
-//     this is a workaround
-namespace detail
-{
-
-template<typename Element>
-  struct reference_msvc_workaround
-{
-  typedef thrust::system::tbb::reference<Element> type;
-}; // end reference_msvc_workaround
-
-} // end detail
-
-/*! \endcond
- */
-
-
-/*! \p pointer stores a pointer to an object allocated in memory available to the tbb system.
- *  This type provides type safety when dispatching standard algorithms on ranges resident
- *  in tbb memory.
- *
- *  \p pointer has pointer semantics: it may be dereferenced and manipulated with pointer arithmetic.
- *
- *  \p pointer can be created with the function \p tbb::malloc, or by explicitly calling its constructor
- *  with a raw pointer.
- *
- *  The raw pointer encapsulated by a \p pointer may be obtained by eiter its <tt>get</tt> member function
- *  or the \p raw_pointer_cast function.
- *
- *  \note \p pointer is not a "smart" pointer; it is the programmer's responsibility to deallocate memory
- *  pointed to by \p pointer.
- *
- *  \tparam T specifies the type of the pointee.
- *
- *  \see tbb::malloc
- *  \see tbb::free
- *  \see raw_pointer_cast
- */
-template<typename T>
-  class pointer
-    : public thrust::pointer<
-               T,
-               thrust::system::tbb::tag,
-               thrust::system::tbb::reference<T>,
-               thrust::system::tbb::pointer<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::pointer<
-      T,
-      thrust::system::tbb::tag,
-      //thrust::system::tbb::reference<T>,
-      typename detail::reference_msvc_workaround<T>::type,
-      thrust::system::tbb::pointer<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    // note that tbb::pointer's member functions need __host__ __device__
-    // to interoperate with nvcc + iterators' dereference member function
-
-    /*! \p pointer's no-argument constructor initializes its encapsulated pointer to \c 0.
-     */
-    __host__ __device__
-    pointer() : super_t() {}
-
-    /*! This constructor allows construction of a <tt>pointer<const T></tt> from a <tt>T*</tt>.
-     *
-     *  \param ptr A raw pointer to copy from, presumed to point to a location in memory
-     *         accessible by the \p tbb system.
-     *  \tparam OtherT \p OtherT shall be convertible to \p T.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    explicit pointer(OtherT *ptr) : super_t(ptr) {}
-
-    /*! This constructor allows construction from another pointer-like object with related type.
-     *
-     *  \param other The \p OtherPointer to copy.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    pointer(const OtherPointer &other,
-            typename thrust::detail::enable_if_pointer_is_convertible<
-              OtherPointer,
-              pointer
-            >::type * = 0) : super_t(other) {}
-
-    /*! Assignment operator allows assigning from another pointer-like object with related type.
-     *
-     *  \param other The other pointer-like object to assign from.
-     *  \tparam OtherPointer The system tag associated with \p OtherPointer shall be convertible
-     *          to \p thrust::system::tbb::tag and its element type shall be convertible to \p T.
-     */
-    template<typename OtherPointer>
-    __host__ __device__
-    typename thrust::detail::enable_if_pointer_is_convertible<
-      OtherPointer,
-      pointer,
-      pointer &
-    >::type
-    operator=(const OtherPointer &other)
-    {
-      return super_t::operator=(other);
-    }
-}; // end pointer
-
-
-/*! \p reference is a wrapped reference to an object stored in memory available to the \p tbb system.
- *  \p reference is the type of the result of dereferencing a \p tbb::pointer.
- *
- *  \tparam T Specifies the type of the referenced object.
- */
-template<typename T>
-  class reference
-    : public thrust::reference<
-               T,
-               thrust::system::tbb::pointer<T>,
-               thrust::system::tbb::reference<T>
-             >
-{
-  /*! \cond
-   */
-
-  private:
-    typedef thrust::reference<
-      T,
-      thrust::system::tbb::pointer<T>,
-      thrust::system::tbb::reference<T>
-    > super_t;
-
-  /*! \endcond
-   */
-
-  public:
-    /*! \cond
-     */
-
-    typedef typename super_t::value_type value_type;
-    typedef typename super_t::pointer    pointer;
-
-    /*! \endcond
-     */
-
-    /*! This constructor initializes this \p reference to refer to an object
-     *  pointed to by the given \p pointer. After this \p reference is constructed,
-     *  it shall refer to the object pointed to by \p ptr.
-     *
-     *  \param ptr A \p pointer to copy from.
-     */
-    __host__ __device__
-    explicit reference(const pointer &ptr)
-      : super_t(ptr)
-    {}
-
-    /*! This constructor accepts a const reference to another \p reference of related type.
-     *  After this \p reference is constructed, it shall refer to the same object as \p other.
-     *
-     *  \param other A \p reference to copy from.
-     *  \tparam OtherT The element type of the other \p reference.
-     *
-     *  \note This constructor is templated primarily to allow initialization of <tt>reference<const T></tt>
-     *        from <tt>reference<T></tt>.
-     */
-    template<typename OtherT>
-    __host__ __device__
-    reference(const reference<OtherT> &other,
-              typename thrust::detail::enable_if_convertible<
-                typename reference<OtherT>::pointer,
-                pointer
-              >::type * = 0)
-      : super_t(other)
-    {}
-
-    /*! Copy assignment operator copy assigns from another \p reference of related type.
-     *
-     *  \param other The other \p reference to assign from.
-     *  \return <tt>*this</tt>
-     *  \tparam OtherT The element type of the other \p reference.
-     */
-    template<typename OtherT>
-    reference &operator=(const reference<OtherT> &other);
-
-    /*! Assignment operator assigns from a \p value_type.
-     *
-     *  \param x The \p value_type to assign from.
-     *  \return <tt>*this</tt>
-     */
-    reference &operator=(const value_type &x);
-}; // end reference
-
-/*! Exchanges the values of two objects referred to by \p reference.
- *  \p x The first \p reference of interest.
- *  \p y The second \p reference ot interest.
- */
-template<typename T>
-__host__ __device__
-void swap(reference<T> x, reference<T> y);
-
 /*! Allocates an area of memory available to Thrust's <tt>tbb</tt> system.
  *  \param n Number of bytes to allocate.
  *  \return A <tt>tbb::pointer<void></tt> pointing to the beginning of the newly
@@ -337,78 +66,38 @@ inline pointer<T> malloc(std::size_t n);
  */
 inline void free(pointer<void> ptr);
 
-// XXX upon c++11
-// template<typename T> using allocator = thrust::detail::malloc_allocator<T,tag,pointer<T> >;
-
-/*! \p tbb::allocator is the default allocator used by the \p tbb system's containers such as
- *  <tt>tbb::vector</tt> if no user-specified allocator is provided. \p tbb::allocator allocates
- *  (deallocates) storage with \p tbb::malloc (\p tbb::free).
+/*! \p tbb::allocator is the default allocator used by the \p tbb system's
+ *  containers such as <tt>tbb::vector</tt> if no user-specified allocator is
+ *  provided. \p tbb::allocator allocates (deallocates) storage with \p
+ *  tbb::malloc (\p tbb::free).
  */
 template<typename T>
-  struct allocator
-    : thrust::detail::malloc_allocator<
-        T,
-        tag,
-        pointer<T>
-      >
-{
-  /*! The \p rebind metafunction provides the type of an \p allocator
-   *  instantiated with another type.
-   *
-   *  \tparam U The other type to use for instantiation.
-   */
-  template<typename U>
-    struct rebind
-  {
-    /*! The typedef \p other gives the type of the rebound \p allocator.
-     */
-    typedef allocator<U> other;
-  };
-
-  /*! No-argument constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator() {}
-
-  /*! Copy constructor has no effect.
-   */
-  __host__ __device__
-  inline allocator(const allocator &) {}
-
-  /*! Constructor from other \p allocator has no effect.
-   */
-  template<typename U>
-  __host__ __device__
-  inline allocator(const allocator<U> &) {}
+using allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::tbb::memory_resource
+>;
 
-  /*! Destructor has no effect.
-   */
-  __host__ __device__
-  inline ~allocator() {}
-}; // end allocator
-
-} // end tbb
-
-/*! \}
+/*! \p tbb::universal_allocator allocates memory that can be used by the \p tbb
+ *  system and host systems.
  */
+template<typename T>
+using universal_allocator = thrust::mr::stateless_resource_allocator<
+  T, thrust::system::tbb::universal_memory_resource
+>;
 
-} // end system
+}} // namespace system::tbb
 
 /*! \namespace thrust::tbb
  *  \brief \p thrust::tbb is a top-level alias for thrust::system::tbb.
  */
 namespace tbb
 {
-
-using thrust::system::tbb::pointer;
-using thrust::system::tbb::reference;
 using thrust::system::tbb::malloc;
 using thrust::system::tbb::free;
 using thrust::system::tbb::allocator;
+using thrust::system::tbb::universal_allocator;
+} // namsespace tbb
 
-} // end tbb
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/tbb/detail/memory.inl>
 
diff --git a/thrust/system/tbb/memory_resource.h b/thrust/system/tbb/memory_resource.h
new file mode 100644
index 000000000..a698b9242
--- /dev/null
+++ b/thrust/system/tbb/memory_resource.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2018-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file tbb/memory_resource.h
+ *  \brief Memory resources for the TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/mr/new.h>
+#include <thrust/mr/fancy_pointer_resource.h>
+
+#include <thrust/system/tbb/pointer.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
+{
+
+//! \cond
+namespace detail
+{
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::tbb::pointer<void>
+    > native_resource;
+
+    typedef thrust::mr::fancy_pointer_resource<
+        thrust::mr::new_delete_resource,
+        thrust::tbb::universal_pointer<void>
+    > universal_native_resource;
+} // namespace detail
+//! \endcond
+
+/*! \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management
+ *  \{
+ */
+
+/*! The memory resource for the TBB system. Uses \p mr::new_delete_resource and
+ *  tags it with \p tbb::pointer.
+ */
+typedef detail::native_resource memory_resource;
+/*! The unified memory resource for the TBB system. Uses
+ *  \p mr::new_delete_resource and tags it with \p tbb::universal_pointer.
+ */
+typedef detail::universal_native_resource universal_memory_resource;
+/*! An alias for \p tbb::universal_memory_resource. */
+typedef detail::native_resource universal_host_pinned_memory_resource;
+
+/*! \} // memory_resources
+ */
+
+}} // namespace system::tbb
+
+THRUST_NAMESPACE_END
diff --git a/thrust/system/tbb/pointer.h b/thrust/system/tbb/pointer.h
new file mode 100644
index 000000000..065e1a548
--- /dev/null
+++ b/thrust/system/tbb/pointer.h
@@ -0,0 +1,117 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file thrust/system/tbb/memory.h
+ *  \brief Managing memory associated with Thrust's TBB system.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <type_traits>
+#include <thrust/system/tbb/detail/execution_policy.h>
+#include <thrust/detail/pointer.h>
+#include <thrust/detail/reference.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
+{
+
+/*! \p tbb::pointer stores a pointer to an object allocated in memory accessible
+ *  by the \p tbb system. This type provides type safety when dispatching
+ *  algorithms on ranges resident in \p tbb memory.
+ *
+ *  \p tbb::pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p tbb::pointer can be created with the function \p tbb::malloc, or by
+ *  explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p tbb::pointer may be obtained by eiter its
+ *  <tt>get</tt> member function or the \p raw_pointer_cast function.
+ *
+ *  \note \p tbb::pointer is not a "smart" pointer; it is the programmer's
+ *        responsibility to deallocate memory pointed to by \p tbb::pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see tbb::malloc
+ *  \see tbb::free
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using pointer = thrust::pointer<
+  T,
+  thrust::system::tbb::tag,
+  thrust::tagged_reference<T, thrust::system::tbb::tag>
+>;
+
+/*! \p tbb::universal_pointer stores a pointer to an object allocated in memory
+ * accessible by the \p tbb system and host systems.
+ *
+ *  \p tbb::universal_pointer has pointer semantics: it may be dereferenced and
+ *  manipulated with pointer arithmetic.
+ *
+ *  \p tbb::universal_pointer can be created with \p tbb::universal_allocator
+ *  or by explicitly calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p tbb::universal_pointer may be obtained
+ *  by eiter its <tt>get</tt> member function or the \p raw_pointer_cast
+ *  function.
+ *
+ *  \note \p tbb::universal_pointer is not a "smart" pointer; it is the
+ *        programmer's responsibility to deallocate memory pointed to by
+ *        \p tbb::universal_pointer.
+ *
+ *  \tparam T specifies the type of the pointee.
+ *
+ *  \see tbb::universal_allocator
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_pointer = thrust::pointer<
+  T,
+  thrust::system::tbb::tag,
+  typename std::add_lvalue_reference<T>::type
+>;
+
+/*! \p reference is a wrapped reference to an object stored in memory available
+ *  to the \p tbb system. \p reference is the type of the result of
+ *  dereferencing a \p tbb::pointer.
+ *
+ *  \tparam T Specifies the type of the referenced object.
+ */
+template <typename T>
+using reference = thrust::tagged_reference<T, thrust::system::tbb::tag>;
+
+}} // namespace system::tbb
+
+/*! \addtogroup system_backends Systems
+ *  \ingroup system
+ *  \{
+ */
+
+/*! \namespace thrust::tbb
+ *  \brief \p thrust::tbb is a top-level alias for \p thrust::system::tbb. */
+namespace tbb
+{
+using thrust::system::tbb::pointer;
+using thrust::system::tbb::universal_pointer;
+using thrust::system::tbb::reference;
+} // namespace tbb
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/system/tbb/vector.h b/thrust/system/tbb/vector.h
index 8607f740b..8cbbabbd2 100644
--- a/thrust/system/tbb/vector.h
+++ b/thrust/system/tbb/vector.h
@@ -2,7 +2,7 @@
  *  Copyright 2008-2013 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in ctbbliance with the License.
+ *  you may not use this file except in compliance with the License.
  *  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
@@ -26,119 +26,57 @@
 #include <thrust/detail/vector_base.h>
 #include <vector>
 
-namespace thrust
+THRUST_NAMESPACE_BEGIN
+namespace system { namespace tbb
 {
-namespace system
-{
-namespace tbb
-{
-
-// XXX upon c++11
-// template<typename T, typename Allocator = allocator<T> > using vector = thrust::detail::vector_base<T,Allocator>;
 
 /*! \p tbb::vector is a container that supports random access to elements,
  *  constant time removal of elements at the end, and linear time insertion
  *  and removal of elements at the beginning or in the middle. The number of
  *  elements in a \p tbb::vector may vary dynamically; memory management is
  *  automatic. The elements contained in a \p tbb::vector reside in memory
- *  available to the \p tbb system.
+ *  accessible by the \p tbb system.
  *
  *  \tparam T The element type of the \p tbb::vector.
- *  \tparam Allocator The allocator type of the \p tbb::vector. Defaults to \p tbb::allocator.
+ *  \tparam Allocator The allocator type of the \p tbb::vector.
+ *          Defaults to \p tbb::allocator.
  *
- *  \see http://www.sgi.com/tech/stl/Vector.html
+ *  \see https://en.cppreference.com/w/cpp/container/vector
  *  \see host_vector For the documentation of the complete interface which is
- *                   shared by \p tbb::vector
+ *                   shared by \p tbb::vector.
  *  \see device_vector
+ *  \see universal_vector
  */
-template<typename T, typename Allocator = allocator<T> >
-  class vector
-    : public thrust::detail::vector_base<T,Allocator>
-{
-  /*! \cond
-   */
-  private:
-    typedef thrust::detail::vector_base<T,Allocator> super_t;
-  /*! \endcond
-   */
-
-  public:
-
-  /*! \cond
-   */
-    typedef typename super_t::size_type  size_type;
-    typedef typename super_t::value_type value_type;
-  /*! \endcond
-   */
-
-    /*! This constructor creates an empty \p tbb::vector.
-     */
-    vector();
-
-    /*! This constructor creates a \p tbb::vector with \p n default-constructed elements.
-     *  \param n The size of the \p tbb::vector to create.
-     */
-    explicit vector(size_type n);
-
-    /*! This constructor creates a \p tbb::vector with \p n copies of \p value.
-     *  \param n The size of the \p tbb::vector to create.
-     *  \param value An element to copy.
-     */
-    explicit vector(size_type n, const value_type &value);
-
-    /*! Copy constructor copies from another \p tbb::vector.
-     *  \param x The other \p tbb::vector to copy.
-     */
-    vector(const vector &x);
-
-    /*! This constructor copies from another Thrust vector-like object.
-     *  \param x The other object to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-
-    /*! This constructor copies from a \c std::vector.
-     *  \param x The \c std::vector to copy from.
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! This constructor creates a \p tbb::vector by copying from a range.
-     *  \param first The beginning of the range.
-     *  \param last The end of the range.
-     */
-    template<typename InputIterator>
-    vector(InputIterator first, InputIterator last);
-
-    // XXX vector_base should take a Derived type so we don't have to define these superfluous assigns
-
-    /*! Assignment operator assigns from a \c std::vector.
-     *  \param x The \c std::vector to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const std::vector<OtherT,OtherAllocator> &x);
-
-    /*! Assignment operator assigns from another Thrust vector-like object.
-     *  \param x The other object to assign from.
-     *  \return <tt>*this</tt>
-     */
-    template<typename OtherT, typename OtherAllocator>
-    vector &operator=(const thrust::detail::vector_base<OtherT,OtherAllocator> &x);
-}; // end vector
+template <typename T, typename Allocator = thrust::system::tbb::allocator<T>>
+using vector = thrust::detail::vector_base<T, Allocator>;
+
+/*! \p tbb::universal_vector is a container that supports random access to
+ *  elements, constant time removal of elements at the end, and linear time
+ *  insertion and removal of elements at the beginning or in the middle. The
+ *  number of elements in a \p tbb::universal_vector may vary dynamically;
+ *  memory management is automatic. The elements contained in a
+ *  \p tbb::universal_vector reside in memory accessible by the \p tbb system
+ *  and host systems.
+ *
+ *  \tparam T The element type of the \p tbb::universal_vector.
+ *  \tparam Allocator The allocator type of the \p tbb::universal_vector.
+ *          Defaults to \p tbb::universal_allocator.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p tbb::universal_vector
+ *  \see device_vector
+ *  \see universal_vector
+ */
+template <typename T, typename Allocator = thrust::system::tbb::universal_allocator<T>>
+using universal_vector = thrust::detail::vector_base<T, Allocator>;
 
-} // end tbb
-} // end system
+}} // namespace system::tbb
 
-// alias system::tbb names at top-level
 namespace tbb
 {
-
 using thrust::system::tbb::vector;
+using thrust::system::tbb::universal_vector;
+}
 
-} // end tbb
-
-} // end thrust
-
-#include <thrust/system/tbb/detail/vector.inl>
-
+THRUST_NAMESPACE_END
diff --git a/thrust/system_error.h b/thrust/system_error.h
index 5c1e72b43..6bf240e51 100644
--- a/thrust/system_error.h
+++ b/thrust/system_error.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2021 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -22,19 +22,18 @@
 
 #include <thrust/detail/config.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
-/*! \addtogroup system System Access
+/*! \addtogroup system
  *  \{
  */
 
 /*! \namespace thrust::system
- *  \brief \p thrust::system is the namespace which contains functionality for manipulating
- *         memory specific to one of Thrust's backend systems. It also contains functionality
- *         for reporting error conditions originating from the operating system or other
- *         low-level application program interfaces such as the CUDA runtime.
- *         They are provided in a separate namespace for import convenience but are
+ *  \brief \p thrust::system is the namespace which contains specific Thrust
+ *         backend systems. It also contains functionality for reporting error
+ *         conditions originating from the operating system or other low-level
+ *         application program interfaces such as the CUDA runtime. They are
+ *         provided in a separate namespace for import convenience but are
  *         also aliased in the top-level \p thrust namespace for easy access.
  */
 namespace system
@@ -44,8 +43,7 @@ namespace system
 /*! \} // end system
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/system/error_code.h>
 #include <thrust/system/system_error.h>
-
diff --git a/thrust/tabulate.h b/thrust/tabulate.h
index 1dcd2c9ee..7cb794550 100644
--- a/thrust/tabulate.h
+++ b/thrust/tabulate.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup transformations
  *  \{
@@ -47,11 +45,11 @@ namespace thrust
  *  \param unary_op The unary operation to apply.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam UnaryOperation is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers
@@ -90,11 +88,11 @@ __host__ __device__
  *  \param last The end of the range.
  *  \param unary_op The unary operation to apply.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and if \c x and \c y are objects of \c ForwardIterator's \c value_type, then <tt>x + y</tt> is defined,
  *          and if \c T is \p ForwardIterator's \c value_type, then <tt>T(0)</tt> is defined.
- *  \tparam UnaryOperation is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam UnaryOperation is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                         and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  The following code snippet demonstrates how to use \p tabulate to generate the first \c n non-positive integers:
@@ -122,8 +120,6 @@ template<typename ForwardIterator, typename UnaryOperation>
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/tabulate.inl>
-
diff --git a/thrust/transform.h b/thrust/transform.h
index 16e0a030f..2d064c13b 100644
--- a/thrust/transform.h
+++ b/thrust/transform.h
@@ -15,7 +15,7 @@
  */
 
 
-/*! \file transform.h
+/*! \file thrust/transform.h
  *  \brief Transforms input ranges using a function object
  */
 
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -52,14 +50,14 @@ namespace thrust
  *  \param first The beginning of the input sequence.
  *  \param last The end of the input sequence.
  *  \param result The beginning of the output sequence.
- *  \param op The tranformation operation.
+ *  \param op The transformation operation.
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -82,7 +80,7 @@ namespace thrust
  *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -110,10 +108,10 @@ __host__ __device__
  *  \param op The tranformation operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                              and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -133,7 +131,7 @@ __host__ __device__
  *  // data is now {5, 0, -2, 3, -2, -4, 0, 1, -2, -8};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -165,12 +163,12 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -196,7 +194,7 @@ template<typename InputIterator,
  *  // output is now {-2,  6,  0,  4,  4,  7};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename DerivedPolicy,
          typename InputIterator1,
@@ -229,12 +227,12 @@ __host__ __device__
  *  \param op The tranformation operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                        and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -257,7 +255,7 @@ __host__ __device__
  *  // output is now {-2,  6,  0,  4,  4,  7};
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/transform.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/transform
  */
 template<typename InputIterator1,
          typename InputIterator2,
@@ -294,13 +292,13 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *
@@ -369,13 +367,13 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *                        and \c InputIterator's \c value_type is convertible to \c Predicate's \c argument_type,
  *                        and \c InputIterator's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *
@@ -444,14 +442,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -516,14 +514,14 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c UnaryFunction's \c argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c Predicate's \c argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                        and \c UnaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first may equal \p result, but the range <tt>[first, last)</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
  *  \pre \p stencil may equal \p result, but the range <tt>[stencil, stencil + (last - first))</tt> shall not overlap the range <tt>[result, result + (last - first))</tt> otherwise.
@@ -588,14 +586,14 @@ template<typename InputIterator1,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
  *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -667,14 +665,14 @@ __host__ __device__
  *  \param pred The predicate operation.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator1's \c value_type is convertible to \c BinaryFunction's \c first_argument_type.
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                         and \c InputIterator2's \c value_type is convertible to \c BinaryFunction's \c second_argument_type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>.
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                         and \c BinaryFunction's \c result_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam Predicate is a model of <a href="http://www.sgi.com/tech/stl/Predicate.html">Predicate</a>.
+ *  \tparam Predicate is a model of <a href="https://en.cppreference.com/w/cpp/concepts/predicate">Predicate</a>.
  *
  *  \pre \p first1 may equal \p result, but the range <tt>[first1, last1)</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
  *  \pre \p first2 may equal \p result, but the range <tt>[first2, first2 + (last1 - first1))</tt> shall not overlap the range <tt>[result, result + (last1 - first1))</tt> otherwise.
@@ -718,8 +716,6 @@ template<typename InputIterator1,
 /*! \} // end transformations
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform.inl>
-
diff --git a/thrust/transform_reduce.h b/thrust/transform_reduce.h
index 32e172d1e..11d6b84c3 100644
--- a/thrust/transform_reduce.h
+++ b/thrust/transform_reduce.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup reductions
  *  \{
@@ -61,13 +59,13 @@ namespace thrust
  *  \return The result of the transformed reduction.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p transform_reduce
@@ -137,13 +135,13 @@ __host__ __device__
  *  \param binary_op The reduction operation.
  *  \return The result of the transformed reduction.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
  *          and \p InputIterator's \c value_type is convertible to \p UnaryFunction's \c argument_type.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>,
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>,
  *          and \p UnaryFunction's \c result_type is convertible to \c OutputType.
- *  \tparam OutputType is a model of <a href="http://www.sgi.com/tech/stl/Assignable.html">Assignable</a>,
+ *  \tparam OutputType is a model of <a href="https://en.cppreference.com/w/cpp/named_req/CopyAssignable">Assignable</a>,
  *          and is convertible to \p BinaryFunction's \c first_argument_type and \c second_argument_type.
- *  \tparam BinaryFunction is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>,
+ *  \tparam BinaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>,
  *          and \p BinaryFunction's \c result_type is convertible to \p OutputType.
  *
  *  The following code snippet demonstrates how to use \p transform_reduce
@@ -191,8 +189,6 @@ template<typename InputIterator,
  *  \} // end reductions
  */
 
-
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform_reduce.inl>
-
diff --git a/thrust/transform_scan.h b/thrust/transform_scan.h
index 8bb883d54..6c0fe8116 100644
--- a/thrust/transform_scan.h
+++ b/thrust/transform_scan.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup algorithms
  */
@@ -64,13 +62,13 @@ namespace thrust
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -130,13 +128,13 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' transformed values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -195,14 +193,14 @@ template<typename InputIterator,
  *  \return The end of the output sequence.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -265,14 +263,14 @@ __host__ __device__
  *  \param binary_op The associatve operator used to 'sum' transformed values.
  *  \return The end of the output sequence.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>
  *                               and \c InputIterator's \c value_type is convertible to \c unary_op's input type.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a>.
- *  \tparam UnaryFunction is a model of <a href="http://www.sgi.com/tech/stl/UnaryFunction.html">Unary Function</a>
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a>.
+ *  \tparam UnaryFunction is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/unary_function">Unary Function</a>
  *                               and accepts inputs of \c InputIterator's \c value_type.  \c UnaryFunction's result_type
  *                               is convertable to \c OutputIterator's \c value_type.
  *  \tparam T is convertible to \c OutputIterator's \c value_type.
- *  \tparam AssociativeOperator is a model of <a href="http://www.sgi.com/tech/stl/BinaryFunction.html">Binary Function</a>
+ *  \tparam AssociativeOperator is a model of <a href="https://en.cppreference.com/w/cpp/utility/functional/binary_function">Binary Function</a>
  *                              and \c AssociativeOperator's \c result_type is
  *                              convertible to \c OutputIterator's \c value_type.
  *
@@ -317,8 +315,6 @@ template<typename InputIterator,
 /*! \} // end prefixsums
  */
 
-	
-} // end namespace thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/transform_scan.inl>
-
diff --git a/thrust/tuple.h b/thrust/tuple.h
index 3e12ed015..04f3154a3 100644
--- a/thrust/tuple.h
+++ b/thrust/tuple.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2018 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -16,12 +16,12 @@
 
 
 /*! \file tuple.h
- *  \brief A type encapsulating a heterogeneous collection of elements
+ *  \brief A type encapsulating a heterogeneous collection of elements.
  */
 
 /*
  * Copyright (C) 1999, 2000 Jaakko Järvi (jaakko.jarvi@cs.utu.fi)
- * 
+ *
  * Distributed under the Boost Software License, Version 1.0.
  * (See accompanying NOTICE file for the complete license)
  *
@@ -34,8 +34,7 @@
 #include <thrust/detail/tuple.inl>
 #include <thrust/pair.h>
 
-namespace thrust
-{
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup utility
  *  \{
@@ -62,17 +61,7 @@ struct null_type;
  *  \see pair
  *  \see tuple
  */
-template<int N, class T>
-  struct tuple_element
-{
-  private:
-    typedef typename T::tail_type Next;
-
-  public:
-    /*! The result of this metafunction is returned in \c type.
-     */
-    typedef typename tuple_element<N-1, Next>::type type;
-}; // end tuple_element
+template <size_t N, class T> struct tuple_element;
 
 /*! This metafunction returns the number of elements
  *  of a \p tuple type of interest.
@@ -82,13 +71,8 @@ template<int N, class T>
  *  \see pair
  *  \see tuple
  */
-template<class T>
-  struct tuple_size
-{
-  /*! The result of this metafunction is returned in \c value.
-   */
-  static const int value = 1 + tuple_size<typename T::tail_type>::value;
-}; // end tuple_size
+template <class T> struct tuple_size;
+
 
 // get function for non-const cons-lists, returns a reference to the element
 
@@ -155,12 +139,12 @@ get(const detail::cons<HT, TT>& t);
 
 
 
-/*! \p tuple is a class template that can be instantiated with up to ten arguments.
- *  Each template argument specifies the type of element in the \p tuple.
- *  Consequently, tuples are heterogeneous, fixed-size collections of values. An
- *  instantiation of \p tuple with two arguments is similar to an instantiation
- *  of \p pair with the same two arguments. Individual elements of a \p tuple may
- *  be accessed with the \p get function.
+/*! \brief \p tuple is a class template that can be instantiated with up to ten
+ *  arguments. Each template argument specifies the type of element in the \p
+ *  tuple. Consequently, tuples are heterogeneous, fixed-size collections of
+ *  values. An instantiation of \p tuple with two arguments is similar to an
+ *  instantiation of \p pair with the same two arguments. Individual elements
+ *  of a \p tuple may be accessed with the \p get function.
  *
  *  \tparam TN The type of the <tt>N</tt> \c tuple element. Thrust's \p tuple
  *          type currently supports up to ten elements.
@@ -171,18 +155,20 @@ get(const detail::cons<HT, TT>& t);
  *  \code
  *  #include <thrust/tuple.h>
  *  #include <iostream>
- *  ...
- *  // create a tuple containing an int, a float, and a string
- *  thrust::tuple<int, float, const char*> t(13, 0.1f, "thrust");
+ *  
+ *  int main() {
+ *    // Create a tuple containing an `int`, a `float`, and a string.
+ *    thrust::tuple<int, float, const char*> t(13, 0.1f, "thrust");
  *
- *  // individual members are accessed with the free function get
- *  std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl; 
+ *    // Individual members are accessed with the free function `get`.
+ *    std::cout << "The first element's value is " << thrust::get<0>(t) << std::endl;
  *
- *  // or the member function get
- *  std::cout << "The second element's value is " << t.get<1>() << std::endl;
+ *    // ... or the member function `get`.
+ *    std::cout << "The second element's value is " << t.get<1>() << std::endl;
  *
- *  // we can also modify elements with the same function
- *  thrust::get<0>(t) += 10;
+ *    // We can also modify elements with the same function.
+ *    thrust::get<0>(t) += 10;
+ *  }
  *  \endcode
  *
  *  \see pair
@@ -194,8 +180,12 @@ get(const detail::cons<HT, TT>& t);
  */
 template <class T0, class T1, class T2, class T3, class T4,
           class T5, class T6, class T7, class T8, class T9>
-  class tuple :
-    public detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+  class tuple
+  /*! \cond
+   */
+    : public detail::map_tuple_to_cons<T0, T1, T2, T3, T4, T5, T6, T7, T8, T9>::type
+  /*! \endcond
+   */
 {
   /*! \cond
    */
@@ -207,6 +197,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    */
 
   public:
+
   /*! \p tuple's no-argument constructor initializes each element.
    */
   inline __host__ __device__
@@ -216,7 +207,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    *     and intializes all other elements.
    *  \param t0 The value to assign to this \p tuple's first element.
    */
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0)
     : inherited(t0,
                 static_cast<const null_type&>(null_type()),
@@ -235,7 +226,7 @@ template <class T0, class T1, class T2, class T3, class T4,
    *  \param t1 The value to assign to this \p tuple's second element.
    *  \note \p tuple's constructor has ten variants of this form, the rest of which are ommitted here for brevity.
    */
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1)
     : inherited(t0, t1,
@@ -251,7 +242,7 @@ template <class T0, class T1, class T2, class T3, class T4,
   /*! \cond
    */
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2)
@@ -264,7 +255,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -277,7 +268,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -290,7 +281,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -303,7 +294,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -316,7 +307,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -329,7 +320,7 @@ template <class T0, class T1, class T2, class T3, class T4,
                 static_cast<const null_type&>(null_type()),
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -342,7 +333,7 @@ template <class T0, class T1, class T2, class T3, class T4,
     : inherited(t0, t1, t2, t3, t4, t5, t6, t7, t8,
                 static_cast<const null_type&>(null_type())) {}
 
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(typename access_traits<T0>::parameter_type t0,
         typename access_traits<T1>::parameter_type t1,
         typename access_traits<T2>::parameter_type t2,
@@ -357,11 +348,12 @@ template <class T0, class T1, class T2, class T3, class T4,
 
 
   template<class U1, class U2>
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple(const detail::cons<U1, U2>& p) : inherited(p) {}
 
+  __thrust_exec_check_disable__
   template <class U1, class U2>
-  inline __host__ __device__ 
+  inline __host__ __device__
   tuple& operator=(const detail::cons<U1, U2>& k)
   {
     inherited::operator=(k);
@@ -374,6 +366,7 @@ template <class T0, class T1, class T2, class T3, class T4,
   /*! This assignment operator allows assigning the first two elements of this \p tuple from a \p pair.
    *  \param k A \p pair to assign from.
    */
+  __thrust_exec_check_disable__
   template <class U1, class U2>
   __host__ __device__ inline
   tuple& operator=(const thrust::pair<U1, U2>& k) {
@@ -579,5 +572,4 @@ bool operator>(const null_type&, const null_type&);
 /*! \} // utility
  */
 
-} // end thrust
-
+THRUST_NAMESPACE_END
diff --git a/thrust/type_traits/integer_sequence.h b/thrust/type_traits/integer_sequence.h
new file mode 100644
index 000000000..26ea54213
--- /dev/null
+++ b/thrust/type_traits/integer_sequence.h
@@ -0,0 +1,381 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief C++14's
+ *  <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::index_sequence</tt></a>,
+ *  associated helper aliases, and some related extensions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <type_traits>
+#include <utility>
+#include <cstdint>
+#include <utility>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \brief A compile-time sequence of
+ *  <a href="https://en.cppreference.com/w/cpp/language/constant_expression#Integral_constant_expression"><i>integral constants</i></a>
+ *  of type \c T with values <tt>Is...</tt>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/language/constant_expression#Integral_constant_expression"><i>integral constants</i></a>
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see integer_sequence_push_front
+ *  \see integer_sequence_push_back
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::integer_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T, T... Is>
+using integer_sequence = std::integer_sequence<T, Is...>;
+#else
+template <typename T, T... Is>
+struct integer_sequence
+{
+  using type = integer_sequence;
+  using value_type = T;
+  using size_type = std::size_t;
+
+  __host__ __device__
+  static constexpr size_type size() noexcept
+  {
+    return sizeof...(Is);
+  }
+};
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief A compile-time sequence of type
+ *  <a href="https://en.cppreference.com/w/cpp/types/size_t">std::size_t</a>
+ *  with values <tt>Is...</tt>.
+ *
+ *  \see integer_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see integer_sequence_push_front
+ *  \see integer_sequence_push_back
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::index_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <std::size_t... Is>
+using index_sequence = std::index_sequence<Is...>;
+#else
+template <std::size_t... Is>
+using index_sequence = integer_sequence<std::size_t, Is...>;
+#endif
+
+#if THRUST_CPP_DIALECT < 2014
+/*! \cond
+ */
+
+namespace detail
+{
+
+/*! \brief Create a new \c integer_sequence containing the elements of \c
+ * Sequence0 followed by the elements of \c Sequence1. \c Sequence0::size() is
+ * added to each element from \c Sequence1 in the new sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see merge_and_renumber_reversed_integer_sequences_impl
+ */
+template <typename Sequence0, typename Sequence1>
+  struct merge_and_renumber_integer_sequences_impl;
+template <typename Sequence0, typename Sequence1>
+  using merge_and_renumber_integer_sequences =
+      typename merge_and_renumber_integer_sequences_impl<
+          Sequence0, Sequence1
+      >::type;
+
+template <typename T, std::size_t N>
+  struct make_integer_sequence_impl;
+
+} // namespace detail
+
+/*! \endcond
+ */
+#endif
+
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>0, 1, 2, ..., N - 1</tt> of type \c T.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::make_integer_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T, std::size_t N>
+using make_integer_sequence = std::make_integer_sequence<T, N>;
+#else
+template <typename T, std::size_t N>
+using make_integer_sequence =
+  typename detail::make_integer_sequence_impl<T, N>::type;
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T, T... Is0, T... Is1>
+struct merge_and_renumber_integer_sequences_impl<
+  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
+>
+{
+  using type = integer_sequence<T, Is0..., (sizeof...(Is0) + Is1)...>;
+};
+
+template <typename T, std::size_t N>
+struct make_integer_sequence_impl
+{
+  using type = merge_and_renumber_integer_sequences<
+    make_integer_sequence<T, N / 2>
+  , make_integer_sequence<T, N - N / 2>
+  >;
+};
+
+template <typename T>
+struct make_integer_sequence_impl<T, 0>
+{
+  using type = integer_sequence<T>;
+};
+
+template <typename T>
+struct make_integer_sequence_impl<T, 1>
+{
+  using type = integer_sequence<T, 0>;
+};
+
+} // namespace detail
+
+/*! \endcond
+ */
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>0, 1, 2, ..., N - 1</tt> of type
+ *  <a href="https://en.cppreference.com/w/cpp/types/size_t">std::size_t</a>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_reversed_index_sequence
+ *  \see <a href="https://en.cppreference.com/w/cpp/utility/integer_sequence"><tt>std::make_index_sequence</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <std::size_t N>
+using make_index_sequence = std::make_index_sequence<N>;
+#else
+template <std::size_t N>
+using make_index_sequence =
+  make_integer_sequence<std::size_t, N>;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+/*! \brief Create a new \c integer_sequence containing the elements of \c
+ *  Sequence0 followed by the elements of \c Sequence1. \c Sequence1::size() is
+ *  added to each element from \c Sequence0 in the new sequence.
+ *
+ *  \see make_reversed_integer_sequence
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ *  \see merge_and_renumber_integer_sequences_impl
+ */
+template <typename Sequence0, typename Sequence1>
+  struct merge_and_renumber_reversed_integer_sequences_impl;
+template <typename Sequence0, typename Sequence1>
+  using merge_and_renumber_reversed_integer_sequences =
+      typename merge_and_renumber_reversed_integer_sequences_impl<
+          Sequence0, Sequence1
+      >::type;
+
+template <typename T, std::size_t N>
+struct make_reversed_integer_sequence_impl;
+
+template <typename T, T Value, typename Sequence>
+struct integer_sequence_push_front_impl;
+
+template <typename T, T Value, typename Sequence>
+struct integer_sequence_push_back_impl;
+
+template <typename T, T... Is0, T... Is1>
+struct merge_and_renumber_reversed_integer_sequences_impl<
+  integer_sequence<T, Is0...>, integer_sequence<T, Is1...>
+>
+{
+  using type = integer_sequence<T, (sizeof...(Is1) + Is0)..., Is1...>;
+};
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief Create a new \c integer_sequence with elements
+ *  <tt>N - 1, N - 2, N - 3, ..., 0</tt>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ *  \see make_reversed_index_sequence
+ */
+template <typename T, std::size_t N>
+using make_reversed_integer_sequence =
+  typename detail::make_reversed_integer_sequence_impl<T, N>::type;
+
+/*! \brief Create a new \c index_sequence with elements
+ *  <tt>N - 1, N - 2, N - 3, ..., 0</tt>.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_reversed_integer_sequence
+ *  \see make_reversed_index_sequence
+ */
+template <std::size_t N>
+using make_reversed_index_sequence =
+  make_reversed_integer_sequence<std::size_t, N>;
+
+/*! \brief Add a new element to the front of an \c integer_sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ */
+template <typename T, T Value, typename Sequence>
+using integer_sequence_push_front =
+  typename detail::integer_sequence_push_front_impl<T, Value, Sequence>::type;
+
+/*! \brief Add a new element to the back of an \c integer_sequence.
+ *
+ *  \see integer_sequence
+ *  \see index_sequence
+ *  \see make_integer_sequence
+ *  \see make_index_sequence
+ */
+template <typename T, T Value, typename Sequence>
+using integer_sequence_push_back =
+  typename detail::integer_sequence_push_back_impl<T, Value, Sequence>::type;
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T, std::size_t N>
+struct make_reversed_integer_sequence_impl
+{
+  using type = merge_and_renumber_reversed_integer_sequences<
+      make_reversed_integer_sequence<T, N / 2>
+    , make_reversed_integer_sequence<T, N - N / 2>
+  >;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+struct make_reversed_integer_sequence_impl<T, 0>
+{
+  using type = integer_sequence<T>;
+};
+
+template <typename T>
+struct make_reversed_integer_sequence_impl<T, 1>
+{
+  using type = integer_sequence<T, 0>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, T I0, T... Is>
+struct integer_sequence_push_front_impl<T, I0, integer_sequence<T, Is...> >
+{
+  using type = integer_sequence<T, I0, Is...>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename T, T I0, T... Is>
+struct integer_sequence_push_back_impl<T, I0, integer_sequence<T, Is...> >
+{
+  using type = integer_sequence<T, Is..., I0>;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/type_traits/is_contiguous_iterator.h b/thrust/type_traits/is_contiguous_iterator.h
new file mode 100644
index 000000000..eaa088978
--- /dev/null
+++ b/thrust/type_traits/is_contiguous_iterator.h
@@ -0,0 +1,295 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief An extensible type trait for determining if an iterator satisifies the
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+ *  requirements (aka is pointer-like).
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+#include <iterator>
+#include <type_traits>
+#include <utility>
+
+#if THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_MSVC && _MSC_VER < 1916 // MSVC 2017 version 15.9
+  #include <vector>
+  #include <string>
+  #include <array>
+
+  #if THRUST_CPP_DIALECT >= 2017
+    #include <string_view>
+  #endif
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename Iterator>
+struct is_contiguous_iterator_impl;
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory, and \c false_type
+ *  otherwise.
+ *
+ * \see is_contiguous_iterator_v
+ * \see proclaim_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
+template <typename Iterator>
+#if THRUST_CPP_DIALECT >= 2011
+using is_contiguous_iterator =
+#else
+struct is_contiguous_iterator :
+#endif
+  detail::is_contiguous_iterator_impl<Iterator>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory, and \c false
+ *  otherwise.
+ *
+ * \see is_contiguous_iterator
+ * \see proclaim_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
+template <typename Iterator>
+constexpr bool is_contiguous_iterator_v = is_contiguous_iterator<Iterator>::value;
+#endif
+
+/*! \brief Customization point that can be customized to indicate that an
+ *  iterator type \c Iterator satisfies
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>,
+ *  aka it points to elements that are contiguous in memory.
+ *
+ * \see is_contiguous_iterator
+ * \see THRUST_PROCLAIM_CONTIGUOUS_ITERATOR
+ */
+template <typename Iterator>
+struct proclaim_contiguous_iterator : false_type {};
+
+/*! \brief Declares that the iterator \c Iterator is
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/ContiguousIterator">ContiguousIterator</a>
+ *  by specializing \c proclaim_contiguous_iterator.
+ *
+ * \see is_contiguous_iterator
+ * \see proclaim_contiguous_iterator
+ */
+#define THRUST_PROCLAIM_CONTIGUOUS_ITERATOR(Iterator)                         \
+  THRUST_NAMESPACE_BEGIN                                                      \
+  template <>                                                                 \
+  struct proclaim_contiguous_iterator<Iterator>                               \
+      : THRUST_NS_QUALIFIER::true_type {};                                    \
+  THRUST_NAMESPACE_END                                                        \
+  /**/
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename Iterator>
+struct is_libcxx_wrap_iter : false_type {};
+
+#if defined(_LIBCPP_VERSION)
+template <typename Iterator>
+struct is_libcxx_wrap_iter<
+  _VSTD::__wrap_iter<Iterator>
+> : true_type {};
+#endif
+
+template <typename Iterator>
+struct is_libstdcxx_normal_iterator : false_type {};
+
+#if defined(__GLIBCXX__)
+template <typename Iterator, typename Container>
+struct is_libstdcxx_normal_iterator<
+  ::__gnu_cxx::__normal_iterator<Iterator, Container>
+> : true_type {};
+#endif
+
+#if   _MSC_VER >= 1916 // MSVC 2017 version 15.9.
+template <typename Iterator>
+struct is_msvc_contiguous_iterator
+  : is_pointer<::std::_Unwrapped_t<Iterator> > {};
+#elif _MSC_VER >= 1700 // MSVC 2012.
+template <typename Iterator>
+struct is_msvc_contiguous_iterator : false_type {};
+
+template <typename Vector>
+struct is_msvc_contiguous_iterator<
+  ::std::_Vector_const_iterator<Vector>
+> : true_type {};
+
+template <typename Vector>
+struct is_msvc_contiguous_iterator<
+  ::std::_Vector_iterator<Vector>
+> : true_type {};
+
+template <typename String>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_const_iterator<String>
+> : true_type {};
+
+template <typename String>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_iterator<String>
+> : true_type {};
+
+template <typename T, std::size_t N>
+struct is_msvc_contiguous_iterator<
+  ::std::_Array_const_iterator<T, N>
+> : true_type {};
+
+template <typename T, std::size_t N>
+struct is_msvc_contiguous_iterator<
+  ::std::_Array_iterator<T, N>
+> : true_type {};
+
+#if THRUST_CPP_DIALECT >= 2017
+template <typename Traits>
+struct is_msvc_contiguous_iterator<
+  ::std::_String_view_iterator<Traits>
+> : true_type {};
+#endif
+#else
+template <typename Iterator>
+struct is_msvc_contiguous_iterator : false_type {};
+#endif
+
+template <typename Iterator>
+struct is_contiguous_iterator_impl
+  : integral_constant<
+      bool
+    ,    is_pointer<Iterator>::value
+      || is_thrust_pointer<Iterator>::value
+      || is_libcxx_wrap_iter<Iterator>::value
+      || is_libstdcxx_normal_iterator<Iterator>::value
+      || is_msvc_contiguous_iterator<Iterator>::value
+      || proclaim_contiguous_iterator<Iterator>::value
+    >
+{};
+
+// Type traits for contiguous iterators:
+template <typename Iterator>
+struct contiguous_iterator_traits
+{
+  static_assert(thrust::is_contiguous_iterator<Iterator>::value,
+                "contiguous_iterator_traits requires a contiguous iterator.");
+
+  using raw_pointer = typename thrust::detail::pointer_traits<
+    decltype(&*std::declval<Iterator>())>::raw_pointer;
+};
+
+template <typename Iterator>
+using contiguous_iterator_raw_pointer_t =
+  typename contiguous_iterator_traits<Iterator>::raw_pointer;
+
+// Converts a contiguous iterator to a raw pointer:
+template <typename Iterator>
+__host__ __device__
+contiguous_iterator_raw_pointer_t<Iterator>
+contiguous_iterator_raw_pointer_cast(Iterator it)
+{
+  static_assert(thrust::is_contiguous_iterator<Iterator>::value,
+                "contiguous_iterator_raw_pointer_cast called with "
+                "non-contiguous iterator.");
+  return thrust::raw_pointer_cast(&*it);
+}
+
+// Implementation for non-contiguous iterators -- passthrough.
+template <typename Iterator,
+          bool IsContiguous = thrust::is_contiguous_iterator<Iterator>::value>
+struct try_unwrap_contiguous_iterator_impl
+{
+  using type = Iterator;
+
+  static __host__ __device__ type get(Iterator it) { return it; }
+};
+
+// Implementation for contiguous iterators -- unwraps to raw pointer.
+template <typename Iterator>
+struct try_unwrap_contiguous_iterator_impl<Iterator, true /*is_contiguous*/>
+{
+  using type = contiguous_iterator_raw_pointer_t<Iterator>;
+
+  static __host__ __device__ type get(Iterator it)
+  {
+    return contiguous_iterator_raw_pointer_cast(it);
+  }
+};
+
+template <typename Iterator>
+using try_unwrap_contiguous_iterator_return_t =
+  typename try_unwrap_contiguous_iterator_impl<Iterator>::type;
+
+// Casts to a raw pointer if iterator is marked as contiguous, otherwise returns
+// the input iterator.
+template <typename Iterator>
+__host__ __device__
+try_unwrap_contiguous_iterator_return_t<Iterator>
+try_unwrap_contiguous_iterator(Iterator it)
+{
+  return try_unwrap_contiguous_iterator_impl<Iterator>::get(it);
+}
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/type_traits/is_execution_policy.h b/thrust/type_traits/is_execution_policy.h
new file mode 100644
index 000000000..f83751ea2
--- /dev/null
+++ b/thrust/type_traits/is_execution_policy.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief A type trait that determines if a type is an \a ExecutionPolicy.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/execution_policy.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is an \a ExecutionPolicy and \c false_type
+ *  otherwise.
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_execution_policy =
+#else
+struct is_execution_policy :
+#endif
+  detail::is_base_of<detail::execution_policy_marker, T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is an
+ *  \a ExecutionPolicy and \c false otherwise.
+ */
+template <typename T>
+constexpr bool is_execution_policy_v = is_execution_policy<T>::value;
+#endif
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/type_traits/is_operator_less_or_greater_function_object.h b/thrust/type_traits/is_operator_less_or_greater_function_object.h
new file mode 100644
index 000000000..ef5a19f69
--- /dev/null
+++ b/thrust/type_traits/is_operator_less_or_greater_function_object.h
@@ -0,0 +1,208 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Type traits for determining if a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  is equivalent to either \c operator< or \c operator>.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T>
+struct is_operator_less_function_object_impl;
+
+template <typename T>
+struct is_operator_greater_function_object_impl;
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator<, and \c false_type otherwise.
+ *
+ *  \see is_operator_less_function_object_v
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_less_function_object =
+#else
+struct is_operator_less_function_object :
+#endif
+  detail::is_operator_less_function_object_impl<T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator<, and \c false otherwise.
+ *
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+constexpr bool is_operator_less_function_object_v
+  = is_operator_less_function_object<T>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator>, and \c false_type otherwise.
+ *
+ *  \see is_operator_greater_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_greater_function_object =
+#else
+struct is_operator_greater_function_object :
+#endif
+  detail::is_operator_greater_function_object_impl<T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator>, and \c false otherwise.
+ *
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+constexpr bool is_operator_greater_function_object_v
+  = is_operator_greater_function_object<T>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator< or \c operator>, and \c false_type otherwise.
+ *
+ *  \see is_operator_less_or_greater_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_less_or_greater_function_object =
+#else
+struct is_operator_less_or_greater_function_object :
+#endif
+  integral_constant<
+    bool
+  ,    detail::is_operator_less_function_object_impl<T>::value
+    || detail::is_operator_greater_function_object_impl<T>::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">BinaryPredicate</a>
+ *  equivalent to \c operator< or \c operator>, and \c false otherwise.
+ *
+ *  \see is_operator_less_or_greater_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_plus_function_object
+ */
+template <typename T>
+constexpr bool is_operator_less_or_greater_function_object_v
+  = is_operator_less_or_greater_function_object<T>::value;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T>
+struct is_operator_less_function_object_impl                   : false_type {};
+template <typename T>
+struct is_operator_less_function_object_impl<thrust::less<T> > : true_type {};
+template <typename T>
+struct is_operator_less_function_object_impl<std::less<T>    > : true_type {};
+
+template <typename T>
+struct is_operator_greater_function_object_impl                      : false_type {};
+template <typename T>
+struct is_operator_greater_function_object_impl<thrust::greater<T> > : true_type {};
+template <typename T>
+struct is_operator_greater_function_object_impl<std::greater<T>    > : true_type {};
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/type_traits/is_operator_plus_function_object.h b/thrust/type_traits/is_operator_plus_function_object.h
new file mode 100644
index 000000000..800847532
--- /dev/null
+++ b/thrust/type_traits/is_operator_plus_function_object.h
@@ -0,0 +1,118 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief Type traits for determining if a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  is equivalent to \c operator+.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/functional.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/detail/type_traits/pointer_traits.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T>
+struct is_operator_plus_function_object_impl;
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  equivalent to \c operator+, and \c false_type otherwise.
+ *
+ *  \see is_operator_plus_function_object_v
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_operator_plus_function_object =
+#else
+struct is_operator_plus_function_object :
+#endif
+  detail::is_operator_plus_function_object_impl<T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is a
+ *  <a href="https://en.cppreference.com/w/cpp/named_req/FunctionObject">FunctionObject</a>
+ *  equivalent to \c operator<, and \c false otherwise.
+ *
+ *  \see is_operator_plus_function_object
+ *  \see is_operator_less_function_object
+ *  \see is_operator_greater_function_object
+ *  \see is_operator_less_or_greater_function_object
+ */
+template <typename T>
+constexpr bool is_operator_plus_function_object_v
+  = is_operator_plus_function_object<T>::value;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T>
+struct is_operator_plus_function_object_impl                   : false_type {};
+template <typename T>
+struct is_operator_plus_function_object_impl<thrust::plus<T> > : true_type {};
+template <typename T>
+struct is_operator_plus_function_object_impl<std::plus<T>    > : true_type {};
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/type_traits/is_trivially_relocatable.h b/thrust/type_traits/is_trivially_relocatable.h
new file mode 100644
index 000000000..21d1f09d8
--- /dev/null
+++ b/thrust/type_traits/is_trivially_relocatable.h
@@ -0,0 +1,366 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief <a href="https://wg21.link/P1144">P1144</a>'s proposed
+ *  \c std::is_trivially_relocatable, an extensible type trait indicating
+ *  whether a type can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/static_assert.h>
+#include <thrust/detail/type_traits.h>
+#include <thrust/type_traits/is_contiguous_iterator.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+  #include <type_traits>
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+template <typename T>
+struct is_trivially_relocatable_impl;
+
+} // namespace detail
+
+/*! \endcond
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that returns \c true_type if \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_trivially_relocatable_v
+ * \see is_trivially_relocatable_to
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename T>
+#if THRUST_CPP_DIALECT >= 2011
+using is_trivially_relocatable =
+#else
+struct is_trivially_relocatable :
+#endif
+  detail::is_trivially_relocatable_impl<T>
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename T>
+constexpr bool is_trivially_relocatable_v = is_trivially_relocatable<T>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/BinaryTypeTrait"><i>BinaryTypeTrait</i></a>
+ *  that returns \c true_type if \c From is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to \c To, aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_trivially_relocatable_to_v
+ * \see is_trivially_relocatable
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename From, typename To>
+#if THRUST_CPP_DIALECT >= 2011
+using is_trivially_relocatable_to =
+#else
+struct is_trivially_relocatable_to :
+#endif
+  integral_constant<
+    bool
+  , detail::is_same<From, To>::value && is_trivially_relocatable<To>::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if \c From is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to \c To, aka can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_indirectly_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename From, typename To>
+constexpr bool is_trivially_relocatable_to_v
+  = is_trivially_relocatable_to<From, To>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/BinaryTypeTrait"><i>BinaryTypeTrait</i></a>
+ *  that returns \c true_type if the element type of \c FromIterator is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to the element type of \c ToIterator, aka can be bitwise copied with a
+ *  facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false_type otherwise.
+ *
+ * \see is_indirectly_trivially_relocatable_to_v
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename FromIterator, typename ToIterator>
+#if THRUST_CPP_DIALECT >= 2011
+using is_indirectly_trivially_relocatable_to =
+#else
+struct is_indirectly_trivially_relocatable_to :
+#endif
+  integral_constant<
+    bool
+  ,    is_contiguous_iterator<FromIterator>::value
+    && is_contiguous_iterator<ToIterator>::value
+    && is_trivially_relocatable_to<
+         typename thrust::iterator_traits<FromIterator>::value_type,
+         typename thrust::iterator_traits<ToIterator>::value_type
+       >::value
+  >
+#if THRUST_CPP_DIALECT < 2011
+{}
+#endif
+;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> that is \c true if the element type of
+ *  \c FromIterator is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  to the element type of \c ToIterator, aka can be bitwise copied with a
+ *  facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  and \c false otherwise.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename FromIterator, typename ToIterator>
+constexpr bool is_indirectly_trivially_relocate_to_v
+  = is_indirectly_trivially_relocatable_to<FromIterator, ToIterator>::value;
+#endif
+
+/*! \brief <a href="http://eel.is/c++draft/namespace.std#def:customization_point"><i>customization point</i></a>
+ *  that can be specialized customized to indicate that a type \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka it can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE
+ */
+template <typename T>
+struct proclaim_trivially_relocatable : false_type {};
+
+/*! \brief Declares that the type \c T is
+ *  <a href="https://wg21.link/P1144"><i>TriviallyRelocatable</i></a>,
+ *  aka it can be bitwise copied with a facility like
+ *  <a href="https://en.cppreference.com/w/cpp/string/byte/memcpy"><tt>std::memcpy</tt></a>,
+ *  by specializing \c proclaim_trivially_relocatable.
+ *
+ * \see is_indirectly_trivially_relocatable_to
+ * \see is_trivially_relocatable
+ * \see is_trivially_relocatable_to
+ * \see proclaim_trivially_relocatable
+ */
+#define THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(T)                              \
+  THRUST_NAMESPACE_BEGIN                                                      \
+  template <>                                                                 \
+  struct proclaim_trivially_relocatable<T> : THRUST_NS_QUALIFIER::true_type   \
+  {};                                                                         \
+  THRUST_NAMESPACE_END                                                        \
+  /**/
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \cond
+ */
+
+namespace detail
+{
+
+// There is no way to actually detect the libstdc++ version; __GLIBCXX__
+// is always set to the date of libstdc++ being packaged, not the release
+// day or version. This means that we can't detect the libstdc++ version,
+// except when compiling with GCC.
+//
+// Therefore, for the best approximation of is_trivially_copyable, we need to
+// handle three distinct cases:
+// 1) GCC above 5, or another C++11 compiler not using libstdc++: use the
+//      standard trait directly.
+// 2) A C++11 compiler using libstdc++ that provides the intrinsic: use the
+//      intrinsic.
+// 3) Any other case (essentially: compiling without C++11): has_trivial_assign.
+
+#ifndef __has_feature
+    #define __has_feature(x) 0
+#endif
+
+template <typename T>
+struct is_trivially_copyable_impl
+    : integral_constant<
+        bool,
+        #if THRUST_CPP_DIALECT >= 2011
+            #if defined(__GLIBCXX__) && __has_feature(is_trivially_copyable)
+                __is_trivially_copyable(T)
+            #elif THRUST_HOST_COMPILER == THRUST_HOST_COMPILER_GCC && THRUST_GCC_VERSION >= 50000
+                std::is_trivially_copyable<T>::value
+            #else
+                has_trivial_assign<T>::value
+            #endif
+        #else
+            has_trivial_assign<T>::value
+        #endif
+    >
+{
+};
+
+// https://wg21.link/P1144R0#wording-inheritance
+template <typename T>
+struct is_trivially_relocatable_impl
+    : integral_constant<
+        bool,
+        is_trivially_copyable_impl<T>::value
+            || proclaim_trivially_relocatable<T>::value
+    >
+{};
+
+template <typename T, std::size_t N>
+struct is_trivially_relocatable_impl<T[N]> : is_trivially_relocatable_impl<T> {};
+
+} // namespace detail
+
+THRUST_NAMESPACE_END
+
+#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA
+
+#include <thrust/system/cuda/detail/guarded_cuda_runtime_api.h>
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(char4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uchar4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(short4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ushort4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(int4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(uint4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(long4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulong4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(longlong4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(ulonglong4)
+
+struct __half;
+struct __half2;
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(__half)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(__half2)
+
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(float4)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double1)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double2)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double3)
+THRUST_PROCLAIM_TRIVIALLY_RELOCATABLE(double4)
+#endif
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
diff --git a/thrust/type_traits/logical_metafunctions.h b/thrust/type_traits/logical_metafunctions.h
new file mode 100644
index 000000000..914b477e8
--- /dev/null
+++ b/thrust/type_traits/logical_metafunctions.h
@@ -0,0 +1,287 @@
+/*
+ *  Copyright 2008-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief C++17's
+ *  <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>,
+ *  <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>,
+ *  and <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ *  metafunctions and related extensions.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... && Ts::value)</tt>.
+ *
+ *  \see conjunction_v
+ *  \see conjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
+template <typename... Ts>
+using conjunction = std::conjunction<Ts...>;
+#else // Older than C++17.
+template <typename... Ts>
+struct conjunction;
+
+/*! \cond
+ */
+
+template <>
+struct conjunction<> : std::true_type {};
+
+template <typename T>
+struct conjunction<T> : T {};
+
+template <typename T0, typename T1>
+struct conjunction<T0, T1> : std::conditional<T0::value, T1, T0>::type {};
+
+template<typename T0, typename T1, typename T2, typename... TN>
+struct conjunction<T0, T1, T2, TN...>
+  : std::conditional<T0::value, conjunction<T1, T2, TN...>, T0>::type {};
+
+/*! \endcond
+ */
+#endif
+
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... && Ts::value)</tt>.
+ *
+ *  \see conjunction
+ *  \see conjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename... Ts>
+constexpr bool conjunction_v = conjunction<Ts...>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... || Ts::value)</tt>.
+ *
+ *  \see disjunction_v
+ *  \see disjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
+template <typename... Ts>
+using disjunction = std::disjunction<Ts...>;
+#else // Older than C++17.
+template <typename... Ts>
+struct disjunction;
+
+/*! \cond
+ */
+
+template <>
+struct disjunction<> : std::false_type {};
+
+template <typename T>
+struct disjunction<T> : T {};
+
+template <typename T0, typename... TN>
+struct disjunction<T0, TN...>
+  : std::conditional<T0::value != false, T0, disjunction<TN...> >::type {};
+
+/*! \endcond
+ */
+#endif
+
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... || Ts::value)</tt>.
+ *
+ *  \see disjunction
+ *  \see disjunction_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename... Ts>
+constexpr bool disjunction_v = disjunction<Ts...>::value;
+#endif
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation_v
+ *  \see negation_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2017
+template <typename T>
+using negation = std::negation<T>;
+#else // Older than C++17.
+template <typename T>
+struct negation;
+
+/*! \cond
+ */
+
+template <typename T>
+struct negation : std::integral_constant<bool, !T::value> {};
+
+/*! \endcond
+ */
+#endif
+
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation
+ *  \see negation_value
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+#if THRUST_CPP_DIALECT >= 2014
+template <typename T>
+constexpr bool negation_v = negation<T>::value;
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... && Bs)</tt>.
+ *
+ *  \see conjunction_value_v
+ *  \see conjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
+template <bool... Bs>
+struct conjunction_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... && Bs)</tt>.
+ *
+ *  \see conjunction_value
+ *  \see conjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/conjunction"><tt>std::conjunction</tt></a>
+ */
+template <bool... Bs>
+constexpr bool conjunction_value_v = conjunction_value<Bs...>::value;
+#endif
+
+/*! \cond
+ */
+
+template <>
+struct conjunction_value<> : std::true_type {};
+
+template <bool B>
+struct conjunction_value<B> : std::integral_constant<bool, B> {};
+
+template <bool B, bool... Bs>
+struct conjunction_value<B, Bs...>
+  : std::integral_constant<bool, B && conjunction_value<Bs...>::value> {};
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>(... || Bs)</tt>.
+ *
+ *  \see disjunction_value_v
+ *  \see disjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+template <bool... Bs>
+struct disjunction_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>(... || Bs)</tt>.
+ *
+ *  \see disjunction_value
+ *  \see disjunction
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/disjunction"><tt>std::disjunction</tt></a>
+ */
+template <bool... Bs>
+constexpr bool disjunction_value_v = disjunction_value<Bs...>::value;
+#endif
+
+/*! \cond
+ */
+
+template <>
+struct disjunction_value<> : std::false_type {};
+
+template <bool B>
+struct disjunction_value<B> : std::integral_constant<bool, B> {};
+
+template <bool B, bool... Bs>
+struct disjunction_value<B, Bs...>
+  : std::integral_constant<bool, B || disjunction_value<Bs...>::value> {};
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/types/integral_constant"><tt>std::integral_constant</tt></a>
+ *  whose value is <tt>!Bs</tt>.
+ *
+ *  \see negation_value_v
+ *  \see negation
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+template <bool B>
+struct negation_value;
+
+#if THRUST_CPP_DIALECT >= 2014
+/*! \brief <tt>constexpr bool</tt> whose value is <tt>!Ts::value</tt>.
+ *
+ *  \see negation_value
+ *  \see negation
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/negation"><tt>std::negation</tt></a>
+ */
+template <bool B>
+constexpr bool negation_value_v = negation_value<B>::value;
+#endif
+
+/*! \cond
+ */
+
+template <bool B>
+struct negation_value : std::integral_constant<bool, !B> {};
+
+/*! \endcond
+ */
+
+///////////////////////////////////////////////////////////////////////////////
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
+#endif // THRUST_CPP_DIALECT >= 2011
+
diff --git a/thrust/type_traits/remove_cvref.h b/thrust/type_traits/remove_cvref.h
new file mode 100644
index 000000000..1da2e0de3
--- /dev/null
+++ b/thrust/type_traits/remove_cvref.h
@@ -0,0 +1,97 @@
+/*
+ *  Copyright 2018-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief C++20's
+ *  <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if  THRUST_CPP_DIALECT >= 2017
+#if __has_include(<version>)
+#  include <version>
+#endif
+#endif
+
+#include <type_traits>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+/*! \brief <a href="https://en.cppreference.com/w/cpp/named_req/UnaryTypeTrait"><i>UnaryTypeTrait</i></a>
+ *  that removes
+ *  <a href="https://en.cppreference.com/w/cpp/language/cv">const-volatile qualifiers</a>
+ *  and
+ *  <a href="https://en.cppreference.com/w/cpp/language/reference">references</a>
+ *  from \c T.
+ *  Equivalent to \c remove_cv_t<remove_reference_t<T>>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_cv</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_const</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_volatile</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_reference</a>
+ */
+#if defined(__cpp_lib_remove_cvref) && (__cpp_lib_remove_cvref >= 201711L)
+using std::remove_cvref;
+#else // Older than C++20.
+template <typename T>
+struct remove_cvref
+{
+  using type = typename std::remove_cv<
+    typename std::remove_reference<T>::type
+  >::type;
+};
+#endif
+
+/*! \brief Type alias that removes
+ *  <a href="https://en.cppreference.com/w/cpp/language/cv">const-volatile qualifiers</a>
+ *  and
+ *  <a href="https://en.cppreference.com/w/cpp/language/reference">references</a>
+ *  from \c T.
+ *  Equivalent to \c remove_cv_t<remove_reference_t<T>>.
+ *
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cvref">std::remove_cvref</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_cv</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_const</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_volatile</a>
+ *  \see <a href="https://en.cppreference.com/w/cpp/types/remove_cv">std::remove_reference</a>
+ */
+#if defined(__cpp_lib_remove_cvref) && (__cpp_lib_remove_cvref >= 201711L)
+using std::remove_cvref_t;
+#else // Older than C++20.
+template <typename T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+#endif
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/type_traits/void_t.h b/thrust/type_traits/void_t.h
new file mode 100644
index 000000000..ed12d861d
--- /dev/null
+++ b/thrust/type_traits/void_t.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2018-2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief C++17's `void_t`.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+#if THRUST_CPP_DIALECT >= 2017
+#  include <type_traits>
+#endif
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup utility
+ *  \{
+ */
+
+/*! \addtogroup type_traits Type Traits
+ *  \{
+ */
+
+#if THRUST_CPP_DIALECT >= 2011
+
+template <typename...> struct voider { using type = void; };
+
+#if THRUST_CPP_DIALECT >= 2017
+using std::void_t;
+#else
+template <typename... Ts> using void_t = typename voider<Ts...>::type;
+#endif
+
+#else // Older than C++11.
+
+template <
+  typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+, typename = void
+>
+struct voider
+{
+  typedef void type;
+};
+
+#endif
+
+/*! \} // type traits
+ */
+
+/*! \} // utility
+ */
+
+THRUST_NAMESPACE_END
+
diff --git a/thrust/uninitialized_copy.h b/thrust/uninitialized_copy.h
index af0f641a7..94c2763e3 100644
--- a/thrust/uninitialized_copy.h
+++ b/thrust/uninitialized_copy.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup copying
  *  \{
@@ -52,8 +50,8 @@ namespace thrust
  *  \return An iterator pointing to the last element of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -87,7 +85,7 @@ namespace thrust
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
  *  \see \c device_new
@@ -116,8 +114,8 @@ __host__ __device__
  *  \param result The first element of the output range to copy to.
  *  \return An iterator pointing to the last element of the output range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -149,7 +147,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
  *  \see \c device_new
@@ -180,9 +178,9 @@ template<typename InputIterator, typename ForwardIterator>
  *  \return An iterator pointing to the last element of the output range.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *  \tparam Size is an integral type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -216,7 +214,7 @@ template<typename InputIterator, typename ForwardIterator>
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
@@ -246,9 +244,9 @@ __host__ __device__
  *  \param result The first element of the output range to copy to.
  *  \return An iterator pointing to the last element of the output range.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>.
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>.
  *  \tparam Size is an integral type.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that takes
  *          a single argument whose type is \p InputIterator's \c value_type.
  *
@@ -280,7 +278,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_copy.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_copy
  *  \see \c uninitialized_copy
  *  \see \c copy
  *  \see \c uninitialized_fill
@@ -296,8 +294,6 @@ template<typename InputIterator, typename Size, typename ForwardIterator>
 /*! \} // copying
  */
 
-
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/uninitialized_copy.inl>
-
diff --git a/thrust/uninitialized_fill.h b/thrust/uninitialized_fill.h
index a73188d6d..b46758a3c 100644
--- a/thrust/uninitialized_fill.h
+++ b/thrust/uninitialized_fill.h
@@ -24,9 +24,7 @@
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup filling
  *  \ingroup transformations
@@ -38,7 +36,7 @@ namespace thrust
  *  an object and then creates an object at that location by calling a
  *  constructor. Occasionally, however, it is useful to separate those two
  *  operations. If each iterator in the range <tt>[first, last)</tt> points
- *  to unitialized memory, then \p unitialized_fill creates copies of \c x
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
  *  in that range. That is, for each iterator \c i in the range <tt>[first, last)</tt>,
  *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
  *  calling \p ForwardIterator's \c value_type's copy constructor.
@@ -51,7 +49,7 @@ namespace thrust
  *  \param x The value to use as the exemplar of the copy constructor.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -80,7 +78,7 @@ namespace thrust
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill_n
  *  \see \c fill
  *  \see \c uninitialized_copy
@@ -99,7 +97,7 @@ __host__ __device__
  *  an object and then creates an object at that location by calling a
  *  constructor. Occasionally, however, it is useful to separate those two
  *  operations. If each iterator in the range <tt>[first, last)</tt> points
- *  to unitialized memory, then \p unitialized_fill creates copies of \c x
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
  *  in that range. That is, for each iterator \c i in the range <tt>[first, last)</tt>,
  *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
  *  calling \p ForwardIterator's \c value_type's copy constructor.
@@ -108,7 +106,7 @@ __host__ __device__
  *  \param last The last element of the range of interest.
  *  \param x The value to use as the exemplar of the copy constructor.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -136,7 +134,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill_n
  *  \see \c fill
  *  \see \c uninitialized_copy
@@ -153,7 +151,7 @@ template<typename ForwardIterator, typename T>
  *  an object and then creates an object at that location by calling a
  *  constructor. Occasionally, however, it is useful to separate those two
  *  operations. If each iterator in the range <tt>[first, first+n)</tt> points
- *  to unitialized memory, then \p unitialized_fill creates copies of \c x
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
  *  in that range. That is, for each iterator \c i in the range <tt>[first, first+n)</tt>,
  *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
  *  calling \p ForwardIterator's \c value_type's copy constructor.
@@ -167,7 +165,7 @@ template<typename ForwardIterator, typename T>
  *  \return <tt>first+n</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -196,7 +194,7 @@ template<typename ForwardIterator, typename T>
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill
  *  \see \c fill
  *  \see \c uninitialized_copy_n
@@ -215,7 +213,7 @@ __host__ __device__
  *  an object and then creates an object at that location by calling a
  *  constructor. Occasionally, however, it is useful to separate those two
  *  operations. If each iterator in the range <tt>[first, first+n)</tt> points
- *  to unitialized memory, then \p unitialized_fill creates copies of \c x
+ *  to uninitialized memory, then \p uninitialized_fill creates copies of \c x
  *  in that range. That is, for each iterator \c i in the range <tt>[first, first+n)</tt>,
  *  \p uninitialized_fill creates a copy of \c x in the location pointed to \c i by
  *  calling \p ForwardIterator's \c value_type's copy constructor.
@@ -225,7 +223,7 @@ __host__ __device__
  *  \param x The value to use as the exemplar of the copy constructor.
  *  \return <tt>first+n</tt>
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          \p ForwardIterator is mutable, and \p ForwardIterator's \c value_type has a constructor that
  *          takes a single argument of type \p T.
  *
@@ -253,7 +251,7 @@ __host__ __device__
  *  // x.val == 46 for all 0 <= i < N
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/uninitialized_fill.html
+ *  \see https://en.cppreference.com/w/cpp/memory/uninitialized_fill
  *  \see \c uninitialized_fill
  *  \see \c fill
  *  \see \c uninitialized_copy_n
@@ -269,7 +267,6 @@ template<typename ForwardIterator, typename Size, typename T>
  *  \} // transformations
  */
 
-} // end thrust
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/uninitialized_fill.inl>
-
diff --git a/thrust/unique.h b/thrust/unique.h
index b4b2118d3..234cd4935 100644
--- a/thrust/unique.h
+++ b/thrust/unique.h
@@ -23,11 +23,10 @@
 
 #include <thrust/detail/config.h>
 #include <thrust/detail/execution_policy.h>
+#include <thrust/iterator/iterator_traits.h>
 #include <thrust/pair.h>
 
-namespace thrust
-{
-
+THRUST_NAMESPACE_BEGIN
 
 /*! \addtogroup stream_compaction
  *  \{
@@ -53,9 +52,9 @@ namespace thrust
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
@@ -72,7 +71,7 @@ namespace thrust
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename DerivedPolicy,
@@ -98,9 +97,9 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \param last  The end of the input range.
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates.
@@ -115,7 +114,7 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename ForwardIterator>
@@ -144,10 +143,10 @@ ForwardIterator unique(ForwardIterator first,
  *  \return The end of the unique range <tt>[first, new_last)</tt>
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates using the \p thrust::host execution policy
@@ -164,7 +163,7 @@ ForwardIterator unique(ForwardIterator first,
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename DerivedPolicy,
@@ -194,10 +193,10 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[first, new_last)</tt>
  *
- *  \tparam ForwardIterator is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator is mutable,
  *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  The following code snippet demonstrates how to use \p unique to
  *  compact a sequence of numbers to remove consecutive duplicates.
@@ -212,7 +211,7 @@ ForwardIterator unique(const thrust::detail::execution_policy_base<DerivedPolicy
  *  // Values beyond new_end are unspecified.
  *  \endcode
  *
- *  \see http://www.sgi.com/tech/stl/unique.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique
  *  \see unique_copy
  */
 template<typename ForwardIterator,
@@ -248,9 +247,9 @@ ForwardIterator unique(ForwardIterator first,
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -272,7 +271,7 @@ ForwardIterator unique(ForwardIterator first,
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -306,9 +305,9 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \param result The beginning of the output range.
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
@@ -328,7 +327,7 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename InputIterator,
          typename OutputIterator>
@@ -355,11 +354,11 @@ OutputIterator unique_copy(InputIterator first,
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -380,7 +379,7 @@ OutputIterator unique_copy(InputIterator first,
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename DerivedPolicy,
          typename InputIterator,
@@ -408,11 +407,11 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[result, result_end)</tt>.
  *
- *  \tparam InputIterator is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *          and \p InputIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam OutputIterator is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *          and \p InputIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam OutputIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator's \c value_type is convertible to \c OutputIterator's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[first,last)</tt> and the range <tt>[result, result + (last - first))</tt> shall not overlap.
  *
@@ -431,7 +430,7 @@ OutputIterator unique_copy(const thrust::detail::execution_policy_base<DerivedPo
  *  \endcode
  *
  *  \see unique
- *  \see http://www.sgi.com/tech/stl/unique_copy.html
+ *  \see https://en.cppreference.com/w/cpp/algorithm/unique_copy
  */
 template<typename InputIterator,
          typename OutputIterator,
@@ -465,10 +464,10 @@ OutputIterator unique_copy(InputIterator first,
  *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
@@ -526,10 +525,10 @@ __host__ __device__
  *  \param values_first The beginning of the value range.
  *  \return A pair of iterators at end of the ranges <tt>[key_first, keys_new_last)</tt> and <tt>[values_first, values_new_last)</tt>.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
@@ -583,12 +582,12 @@ template<typename ForwardIterator1,
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
  *
@@ -645,12 +644,12 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return The end of the unique range <tt>[first, new_last)</tt>.
  *
- *  \tparam ForwardIterator1 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *  \tparam ForwardIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator1 is mutable,
- *          and \p ForwardIterator's \c value_type is a model of <a href="http://www.sgi.com/tech/stl/EqualityComparable.html">Equality Comparable</a>.
- *  \tparam ForwardIterator2 is a model of <a href="http://www.sgi.com/tech/stl/ForwardIterator.html">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is a model of <a href="https://en.cppreference.com/w/cpp/concepts/equality_comparable">Equality Comparable</a>.
+ *  \tparam ForwardIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
  *          and \p ForwardIterator2 is mutable.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The range <tt>[keys_first, keys_last)</tt> and the range <tt>[values_first, values_first + (keys_last - keys_first))</tt> shall not overlap.
  *
@@ -707,11 +706,11 @@ template<typename ForwardIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -773,11 +772,11 @@ __host__ __device__
  *  \param values_result The beginning of the output value range.
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
  *
  *  \pre The input ranges shall not overlap either output range.
@@ -839,13 +838,13 @@ template<typename InputIterator1,
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
  *  \tparam DerivedPolicy The name of the derived execution policy.
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -910,13 +909,13 @@ __host__ __device__
  *  \param binary_pred  The binary predicate used to determine equality.
  *  \return A pair of iterators at end of the ranges <tt>[keys_result, keys_result_last)</tt> and <tt>[values_result, values_result_last)</tt>.
  *
- *  \tparam InputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam InputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/InputIterator.html">Input Iterator</a>,
- *  \tparam OutputIterator1 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam InputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam InputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/input_iterator">Input Iterator</a>,
+ *  \tparam OutputIterator1 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator1's \c value_type is convertible to \c OutputIterator1's \c value_type.
- *  \tparam OutputIterator2 is a model of <a href="http://www.sgi.com/tech/stl/OutputIterator.html">Output Iterator</a> and
+ *  \tparam OutputIterator2 is a model of <a href="https://en.cppreference.com/w/cpp/iterator/output_iterator">Output Iterator</a> and
  *          and \p InputIterator2's \c value_type is convertible to \c OutputIterator2's \c value_type.
- *  \tparam BinaryPredicate is a model of <a href="http://www.sgi.com/tech/stl/BinaryPredicate.html">Binary Predicate</a>.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
  *
  *  \pre The input ranges shall not overlap either output range.
  *
@@ -958,11 +957,184 @@ template<typename InputIterator1,
                      BinaryPredicate binary_pred);
 
 
-/*! \} // end stream_compaction
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses the function object \p binary_pred to test for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine a number of runs of equal elements using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N, thrust::equal_to<int>());
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses \c operator== to test for equality.
+ *
+ *  The algorithm's execution is parallelized as determined by \p exec.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements using the \p thrust::host execution policy
+ *  for parallelization:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N);
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename DerivedPolicy,
+         typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(const thrust::detail::execution_policy_base<DerivedPolicy> &exec,
+                 ForwardIterator first,
+                 ForwardIterator last);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses the function object \p binary_pred to test for equality.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(A, A + N, thrust::equal_to<int>());
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
+ */
+template<typename ForwardIterator,
+         typename BinaryPredicate>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last,
+                 BinaryPredicate binary_pred);
+
+
+/*! \p unique_count counts runs of equal elements in the range <tt>[first, last)</tt>
+ *  with the same value, 
+ *
+ *  This version of \p unique_count uses \c operator== to test for equality.
+ *
+ *  \param exec The execution policy to use for parallelization.
+ *  \param first The beginning of the input range.
+ *  \param last  The end of the input range.
+ *  \param binary_pred  The binary predicate used to determine equality.
+ *  \return The number of runs of equal elements in <tt>[first, new_last)</tt>
+ *
+ *  \tparam DerivedPolicy The name of the derived execution policy.
+ *  \tparam ForwardIterator is a model of <a href="https://en.cppreference.com/w/cpp/iterator/forward_iterator">Forward Iterator</a>,
+ *          and \p ForwardIterator's \c value_type is convertible to \p BinaryPredicate's \c first_argument_type and to \p BinaryPredicate's \c second_argument_type.
+ *  \tparam BinaryPredicate is a model of <a href="https://en.cppreference.com/w/cpp/named_req/BinaryPredicate">Binary Predicate</a>.
+ *
+ *  The following code snippet demonstrates how to use \p unique_count to
+ *  determine the number of runs of equal elements:
+ *
+ *  \code
+ *  #include <thrust/unique.h>
+ *  #include <thrust/execution_policy.h>
+ *  ...
+ *  const int N = 7;
+ *  int A[N] = {1, 3, 3, 3, 2, 2, 1};
+ *  int count = thrust::unique_count(thrust::host, A, A + N);
+ *  // count is now 4
+ *  \endcode
+ *
+ *  \see unique_copy
+ *  \see unique_by_key_copy
+ *  \see reduce_by_key_copy
  */
+template<typename ForwardIterator>
+__host__ __device__
+  typename thrust::iterator_traits<ForwardIterator>::difference_type
+    unique_count(ForwardIterator first,
+                 ForwardIterator last);
 
 
-} // end namespace thrust
+/*! \} // end stream_compaction
+ */
+
+THRUST_NAMESPACE_END
 
 #include <thrust/detail/unique.inl>
 
diff --git a/thrust/universal_allocator.h b/thrust/universal_allocator.h
new file mode 100644
index 000000000..8d85cd20d
--- /dev/null
+++ b/thrust/universal_allocator.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+
+/*! \file universal_allocator.h
+ *  \brief An allocator which creates new elements in memory accessible to both
+ *         hosts and devices.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+
+// #include the device system's vector header
+#define __THRUST_DEVICE_SYSTEM_MEMORY_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/memory.h>
+#include __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+#undef __THRUST_DEVICE_SYSTEM_MEMORY_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+/** \addtogroup memory_resources Memory Resources
+ *  \ingroup memory_management_classes
+ *  \{
+ */
+
+/*! \brief An allocator which creates new elements in memory accessible by
+ *         both hosts and devices.
+ *
+ *  \see https://en.cppreference.com/w/cpp/named_req/Allocator
+ */
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_allocator;
+
+/*! \p universal_ptr stores a pointer to an object allocated in memory accessible
+ *  to both hosts and devices.
+ *
+ *  Algorithms dispatched with this type of pointer will be dispatched to
+ *  either host or device, depending on which backend you are using. Explicit
+ *  policies (\p thrust::device, etc) can be used to specify where an algorithm
+ *  should be run.
+ *
+ *  \p universal_ptr has pointer semantics: it may be dereferenced safely from
+ *  both hosts and devices and may be manipulated with pointer arithmetic.
+ *
+ *  \p universal_ptr can be created with \p universal_allocator or by explicitly
+ *  calling its constructor with a raw pointer.
+ *
+ *  The raw pointer encapsulated by a \p universal_ptr may be obtained by
+ *  either its <tt>get</tt> method or the \p raw_pointer_cast free function.
+ *
+ *  \note \p universal_ptr is not a smart pointer; it is the programmer's
+ *  responsibility to deallocate memory pointed to by \p universal_ptr.
+ *
+ *  \see host_ptr For the documentation of the complete interface which is
+ *                shared by \p universal_ptr.
+ *  \see raw_pointer_cast
+ */
+template <typename T>
+using universal_ptr =
+  thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_pointer<T>;
+
+/*! \}
+ */
+
+THRUST_NAMESPACE_END
diff --git a/thrust/detail/host_vector.inl b/thrust/universal_ptr.h
similarity index 57%
rename from thrust/detail/host_vector.inl
rename to thrust/universal_ptr.h
index e424dd1e1..9d1de19d5 100644
--- a/thrust/detail/host_vector.inl
+++ b/thrust/universal_ptr.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2020 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -15,24 +15,12 @@
  */
 
 
-/*! \file host_vector.inl
- *  \brief Inline file for host_vector.h.
+/*! \file universal_ptr.h
+ *  \brief A pointer to a variable which resides memory accessible to both
+ *         hosts and devices.
  */
 
-#include <thrust/host_vector.h>
+#pragma once
 
-namespace thrust
-{
-
-template<typename T, typename Alloc>
-  template<typename OtherT, typename OtherAlloc>
-    __host__
-    host_vector<T,Alloc>
-      ::host_vector(const device_vector<OtherT,OtherAlloc> &v)
-        :Parent(v)
-{
-  ;
-} // end host_vector::host_vector()
-
-} // end namespace thrust
+#include <thrust/universal_allocator.h>
 
diff --git a/thrust/universal_vector.h b/thrust/universal_vector.h
new file mode 100644
index 000000000..0ce38fd86
--- /dev/null
+++ b/thrust/universal_vector.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright 2008-2020 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+/*! \file
+ *  \brief A dynamically-sizable array of elements which resides in memory
+ *         accessible to both hosts and devices.
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/universal_allocator.h>
+
+// #include the device system's vector header
+#define __THRUST_DEVICE_SYSTEM_VECTOR_HEADER <__THRUST_DEVICE_SYSTEM_ROOT/vector.h>
+#include __THRUST_DEVICE_SYSTEM_VECTOR_HEADER
+#undef __THRUST_DEVICE_SYSTEM_VECTOR_HEADER
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup containers Containers
+ *  \{
+ */
+
+/*! A \p universal_vector is a container that supports random access to elements,
+ *  constant time removal of elements at the end, and linear time insertion
+ *  and removal of elements at the beginning or in the middle. The number of
+ *  elements in a \p universal_vector may vary dynamically; memory management is
+ *  automatic. The memory associated with a \p universal_vector resides in memory
+ *  accessible to hosts and devices.
+ *
+ *  \see https://en.cppreference.com/w/cpp/container/vector
+ *  \see host_vector For the documentation of the complete interface which is
+ *                   shared by \p universal_vector.
+ *  \see device_vector
+ */
+using thrust::system::__THRUST_DEVICE_SYSTEM_NAMESPACE::universal_vector;
+
+/*! \} // containers
+ */
+
+THRUST_NAMESPACE_END
diff --git a/thrust/version.h b/thrust/version.h
index d21b7c407..71f1adb69 100644
--- a/thrust/version.h
+++ b/thrust/version.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright 2008-2013 NVIDIA Corporation
+ *  Copyright 2008-2022 NVIDIA Corporation
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 
 #pragma once
 
-//  This is the only Thrust header that is guaranteed to 
+//  This is the only Thrust header that is guaranteed to
 //  change with every Thrust release.
 //
 //  THRUST_VERSION % 100 is the sub-minor version
@@ -47,7 +47,7 @@
  *         <tt>THRUST_VERSION / 100 % 1000</tt> is the minor version.
  *         <tt>THRUST_VERSION / 100000</tt> is the major version.
  */
-#define THRUST_VERSION 100803
+#define THRUST_VERSION 200200
 
 /*! \def THRUST_MAJOR_VERSION
  *  \brief The preprocessor macro \p THRUST_MAJOR_VERSION encodes the
@@ -67,14 +67,9 @@
  */
 #define THRUST_SUBMINOR_VERSION  (THRUST_VERSION % 100)
 
-// Declare these namespaces here for the purpose of Doxygenating them
-
-/*! \namespace thrust
- *  \brief \p thrust is the top-level namespace which contains all Thrust
- *         functions and types.
+/*! \def THRUST_PATCH_NUMBER
+ *  \brief The preprocessor macro \p THRUST_PATCH_NUMBER encodes the
+ *         patch number of the Thrust library.
+ *         Legacy; will be 0 for all future releases.
  */
-namespace thrust
-{
-
-}
-
+#define THRUST_PATCH_NUMBER 0
diff --git a/thrust/zip_function.h b/thrust/zip_function.h
new file mode 100644
index 000000000..7653f9b7f
--- /dev/null
+++ b/thrust/zip_function.h
@@ -0,0 +1,212 @@
+
+/*! \file thrust/zip_function.h
+ *  \brief Adaptor type that turns an N-ary function object into one that takes
+ *         a tuple of size N so it can easily be used with algorithms taking zip
+ *         iterators
+ */
+
+#pragma once
+
+#include <thrust/detail/config.h>
+#include <thrust/detail/cpp11_required.h>
+#include <thrust/detail/modern_gcc_required.h>
+
+#if THRUST_CPP_DIALECT >= 2011 && !defined(THRUST_LEGACY_GCC)
+
+#include <thrust/tuple.h>
+#include <thrust/type_traits/integer_sequence.h>
+#include <thrust/detail/type_deduction.h>
+
+THRUST_NAMESPACE_BEGIN
+
+/*! \addtogroup function_objects Function Objects
+ *  \{
+ */
+
+/*! \addtogroup function_object_adaptors Function Object Adaptors
+ *  \ingroup function_objects
+ *  \{
+ */
+
+namespace detail {
+namespace zip_detail {
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+__thrust_exec_check_disable__
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+decltype(auto) apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+{
+  return func(thrust::get<Is>(THRUST_FWD(args))...);
+}
+
+template <typename Function, typename Tuple>
+__host__ __device__
+decltype(auto) apply(Function&& func, Tuple&& args)
+{
+  constexpr auto tuple_size = thrust::tuple_size<typename std::decay<Tuple>::type>::value;
+  return apply_impl(THRUST_FWD(func), THRUST_FWD(args), make_index_sequence<tuple_size>{});
+}
+
+#else // THRUST_CPP_DIALECT
+
+__thrust_exec_check_disable__
+template <typename Function, typename Tuple, std::size_t... Is>
+__host__ __device__
+auto apply_impl(Function&& func, Tuple&& args, index_sequence<Is...>)
+THRUST_DECLTYPE_RETURNS(func(thrust::get<Is>(THRUST_FWD(args))...))
+
+template <typename Function, typename Tuple>
+__host__ __device__
+auto apply(Function&& func, Tuple&& args)
+THRUST_DECLTYPE_RETURNS(
+    apply_impl(
+      THRUST_FWD(func),
+      THRUST_FWD(args),
+      make_index_sequence<
+        thrust::tuple_size<typename std::decay<Tuple>::type>::value>{})
+)
+
+#endif // THRUST_CPP_DIALECT
+
+} // namespace zip_detail
+} // namespace detail
+
+/*! \p zip_function is a function object that allows the easy use of N-ary
+ *  function objects with \p zip_iterators without redefining them to take a
+ *  \p tuple instead of N arguments.
+ *
+ *  This means that if a functor that takes 2 arguments which could be used with
+ *  the \p transform function and \p device_iterators can be extended to take 3
+ *  arguments and \p zip_iterators without rewriting the functor in terms of
+ *  \p tuple.
+ *
+ *  The \p make_zip_function convenience function is provided to avoid having
+ *  to explicitely define the type of the functor when creating a \p zip_function,
+ *  whic is especially helpful when using lambdas as the functor.
+ *
+ *  \code
+ *  #include <thrust/iterator/zip_iterator.h>
+ *  #include <thrust/device_vector.h>
+ *  #include <thrust/transform.h>
+ *  #include <thrust/zip_function.h>
+ *
+ *  struct SumTuple {
+ *    float operator()(Tuple tup) {
+ *      return std::get<0>(tup) + std::get<1>(tup) + std::get<2>(tup);
+ *    }
+ *  };
+ *  struct SumArgs {
+ *    float operator()(float a, float b, float c) {
+ *      return a + b + c;
+ *    }
+ *  };
+ *
+ *  int main() {
+ *    thrust::device_vector<float> A(3);
+ *    thrust::device_vector<float> B(3);
+ *    thrust::device_vector<float> C(3);
+ *    thrust::device_vector<float> D(3);
+ *    A[0] = 0.f; A[1] = 1.f; A[2] = 2.f;
+ *    B[0] = 1.f; B[1] = 2.f; B[2] = 3.f;
+ *    C[0] = 2.f; C[1] = 3.f; C[2] = 4.f;
+ *
+ *    // The following four invocations of transform are equivalent
+ *    // Transform with 3-tuple
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      SumTuple{});
+ *
+ *    // Transform with 3 parameters
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      adapted);
+ *
+ *    // Transform with 3 parameters with convenience function
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function(SumArgs{}));
+ *
+ *    // Transform with 3 parameters with convenience function and lambda
+ *    thrust::zip_function<SumArgs> adapted{};
+ *    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(A.begin(), B.begin(), C.begin())),
+ *                      thrust::make_zip_iterator(thrust::make_tuple(A.end(), B.end(), C.end())),
+ *                      D.begin(),
+ *                      thrust::make_zip_function([] (float a, float b, float c) {
+ *                                                  return a + b + c;
+ *                                                }));
+ *    return 0;
+ *  }
+ *  \endcode
+ *
+ *  \see make_zip_function
+ *  \see zip_iterator
+ */
+template <typename Function>
+class zip_function
+{
+  public:
+     __host__ __device__
+    zip_function(Function func) : func(std::move(func)) {}
+
+// Add workaround for decltype(auto) on C++11-only compilers:
+#if THRUST_CPP_DIALECT >= 2014
+
+    template <typename Tuple>
+    __host__ __device__
+    decltype(auto) operator()(Tuple&& args) const
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#else // THRUST_CPP_DIALECT
+
+    // Can't just use THRUST_DECLTYPE_RETURNS here since we need to use
+    // std::declval for the signature components:
+    template <typename Tuple>
+    __host__ __device__
+    auto operator()(Tuple&& args) const
+    noexcept(noexcept(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args))))
+    THRUST_TRAILING_RETURN(decltype(detail::zip_detail::apply(std::declval<Function>(), THRUST_FWD(args))))
+    {
+        return detail::zip_detail::apply(func, THRUST_FWD(args));
+    }
+
+#endif // THRUST_CPP_DIALECT
+
+  private:
+    mutable Function func;
+};
+
+/*! \p make_zip_function creates a \p zip_function from a function object.
+ *
+ *  \param fun The N-ary function object.
+ *  \return A \p zip_function that takes a N-tuple.
+ *
+ *  \see zip_function
+ */
+template <typename Function>
+__host__ __device__
+zip_function<typename std::decay<Function>::type>
+make_zip_function(Function&& fun)
+{
+    using func_t = typename std::decay<Function>::type;
+    return zip_function<func_t>(THRUST_FWD(fun));
+}
+
+/*! \} // end function_object_adaptors
+ */
+
+/*! \} // end function_objects
+ */
+
+THRUST_NAMESPACE_END
+
+#endif